User:Dr vulpes/code/R/rfpp batch bot code.R

# RFPP Batch Bot
# Moves protected articles from "Active requests" to "Handled requests"

# AI/LLM use disclosure [[WP:LLMDISCLOSE]]
# I used Claude by Anthropic to help with the regex, 
# making the console output look pretty, and some of 
# the comments explaining what's going on. -DV 2024-04-20

library(httr2)
library(tidyverse)
library(glue)

# config and set up
BOT_CONFIG <- list(
  api_url    = "https://en.wikipedia.org/w/api.php",
  username   = Sys.getenv("WP_BOT_USERNAME"),
  password   = Sys.getenv("WP_BOT_PASSWORD"),
  batch_page = "Wikipedia:Requests for page protection/Batch",
  user_agent = "VulpesBot/1.0 Task 5 RFPP Batch Mover (https://en.wikipedia.org/wiki/User:VulpesBot)",
  cookie_jar = file.path(tempdir(), "wp_cookies.txt"),
  
  # I added a dry run because I'm really bad with working with strings
  # TRUE one does parse + report only, no edit submitted
  # FALSE parse + edit
  dry_run    = FALSE
)

stopifnot(
  "WP_BOT_USERNAME not set in .Renviron" = nchar(BOT_CONFIG$username) > 0,
  "WP_BOT_PASSWORD not set in .Renviron" = nchar(BOT_CONFIG$password) > 0
)

# request
wp_req <- function(...) {
  request(BOT_CONFIG$api_url) |>
    req_url_query(format = "json", formatversion = "2", ...) |>
    req_cookie_preserve(BOT_CONFIG$cookie_jar) |>
    req_user_agent(BOT_CONFIG$user_agent) |>
    req_retry(max_tries = 3, backoff = ~ 5)
}

#auth
login <- function() {
  # Clear any stale session
  if (file.exists(BOT_CONFIG$cookie_jar))
    file.remove(BOT_CONFIG$cookie_jar)
  
  # get a new login token, not a cached token
  login_token <- wp_req(action = "query",
                        meta = "tokens",
                        type = "login") |>
    req_perform() |>
    resp_body_json() |>
    pluck("query", "tokens", "logintoken")
  
  # POST credentials
  result <- wp_req(action = "login") |>
    req_method("POST") |>
    req_body_form(
      lgname     = BOT_CONFIG$username,
      lgpassword = BOT_CONFIG$password,
      lgtoken    = login_token
    ) |>
    req_perform() |>
    resp_body_json()
  
  # this is so it looks pretty in the console
  if (result$login$result != "Success") {
    stop(
      "Login failed: ",
      result$login$result,
      "\nReason: ",
      result$login$reason %||% "(none)"
    )
  }
  message("Authenticated as: ", result$login$lgusername)
  invisible(result)
}

get_csrf_token <- function() {
  wp_req(action = "query", meta = "tokens") |>
    req_perform() |>
    resp_body_json() |>
    pluck("query", "tokens", "csrftoken")
}

# fetch the page
fetch_wikitext <- function(title) {
  resp <- wp_req(
    action  = "query",
    titles  = title,
    prop    = "revisions",
    rvprop  = "content",
    rvslots = "main"
  ) |>
    req_perform() |>
    resp_body_json()
  
  content <- resp |>
    pluck("query", "pages", 1, "revisions", 1, "slots", "main", "content")
  
  if (is.null(content))
    stop("Could not retrieve wikitext for: ", title)
  content
}

# regex helper

regex_escape <- function(x) {
  str_replace_all(x, r"([\\^$.|?*+()\[\]{}])", r"(\\\0)")
}

# parse the active articles in [[WP:BATCH]]
# returns a tibble with columns: title, raw_line, subsection
# Walks every line under === Active requests ===, tracking ==== subsection ====
# headers so each article knows which handled subsection it belongs to.


parse_active_articles <- function(wikitext) {
  # Capture the entire Active requests section (up to Handled requests)
  active_block <- str_match(wikitext,
                            r"((?s)=== Active requests ===\n(.*?)=== Handled requests ===)")[, 2]
  
  if (is.na(active_block)) {
    warning("Could not locate the === Active requests === section")
    return(tibble(
      title = character(),
      raw_line = character(),
      subsection = character()
    ))
  }
  
  lines <- str_split(active_block, "\n")[[1]]
  
  # walking down the lines of [[WP:BATCH]]
  current_subsection <- NA_character_
  rows <- list()
  
  for (line in lines) {
     # goes by headers down the wikitext
    if (str_detect(line, r"(^====\s*.+\s*====\s*$)")) {
      current_subsection <- str_trim(str_remove_all(line, "="))
      next
    }
    
    # only use a link if it contains a {{pagelinks|...}} template
    if (!str_detect(line, r"(\{\{pagelinks\|)"))
      next
    
    title <- str_match(line, r"(\{\{pagelinks\|(.+?)\}\})")[, 2]
    
    # skip missing titles and the template placeholder
    if (is.na(title) || title == "PAGE TITLE HERE")
      next
    
    rows[[length(rows) + 1]] <- tibble(raw_line   = line,
                                       title      = title,
                                       subsection = current_subsection)
  }
  
  if (length(rows) == 0) {
    return(tibble(
      title = character(),
      raw_line = character(),
      subsection = character()
    ))
  }
  
  list_rbind(rows)
}

# check if article is really protected
query_protection_api <- function(titles) {
  resp <- wp_req(
    action  = "query",
    titles  = paste(titles, collapse = "|"),
    prop    = "info",
    inprop  = "protection"
  ) |>
    req_perform() |>
    resp_body_json()
  
  resp |>
    pluck("query", "pages") |>
    map(\(page) {
      protections <- page$protection %||% list()
      edit_prot   <- keep(protections, \(p) p$type == "edit")
      tibble(
        title        = page$title,
        is_protected = length(edit_prot) > 0,
        prot_level   = if (length(edit_prot) > 0)
          edit_prot[[1]]$level
        else
          NA_character_
      )
    }) |>
    list_rbind()
}

# check to see if there is a protection template present in the article wikitext.

has_protection_template <- function(title) {
  text <- tryCatch(
    fetch_wikitext(title),
    error = \(e) NULL
  )
  if (is.null(text))
    return(FALSE)
  str_detect(text, r"(\{\{[Pp][Pp][-–])")
}

# run both checks
check_all_articles <- function(titles) {
  # Batch the API calls in chunks of 50
  api_results <- titles |>
    split(ceiling(seq_along(titles) / 50)) |>
    map(query_protection_api) |>
    list_rbind()
  
  # only bother getting wikitext for articles that are protected
  api_results |>
    mutate(
      has_template = map2_lgl(title, is_protected, \(t, prot) {
        if (prot)
          has_protection_template(t)
        else
          FALSE
      }),
      ready = is_protected & has_template
    )
}

# wikitext formater
# articles from each ==== subsection ==== in Active requests are inserted into 
# the matching ==== subsection ==== in Handled requests. For now there is only
# one but there may be others added in the future. 

move_to_handled <- function(wikitext, ready) {
  if (nrow(ready) == 0)
    return(wikitext)
  
  new_wikitext <- wikitext
  
  # Remove moved article lines from Active requests
  all_titles <- ready$title
  esc_titles <- regex_escape(all_titles)
  
  removal_pattern <- paste0(
    r"(:?\*\s*\{\{pagelinks\|(?:)",
    paste(esc_titles, collapse = "|"),
    r"()\}\}[^\n]*\n?)"
  )
  new_wikitext <- str_remove_all(new_wikitext, removal_pattern)
  
  # clean up submitter headers 
  new_wikitext <- str_remove_all(new_wikitext,
                                 r"((?m)^\* From \[\[.*?\]\].*?\n(?=\* From |\n*<!-- =))")
  
  # insert entries into the matching Handled subsection
  # split by both sections so R doesn't process things wrong
  split_point <- "=== Handled requests ==="
  split_pos   <- str_locate(new_wikitext, fixed(split_point))[, "start"]
  active_half  <- str_sub(new_wikitext, 1, split_pos - 1)
  handled_half <- str_sub(new_wikitext, split_pos)
  
  ready |>
    group_by(subsection) |>
    group_walk(\(df, grp) {
      sec <- grp$subsection
      
      new_entries <- df$raw_line |>
        map_chr(\(ln) paste0(ln, "")) |>
        paste(collapse = "\n")
      
      # look for this header pattern
      header_pattern <- paste0(r"((==== )", esc_sec, r"( ====\n))")
      
      handled_half <<- str_replace(handled_half,
                                   header_pattern,
                                   paste0("\\1", new_entries, "\n"))
    })
  
  new_wikitext <- paste0(active_half, handled_half)
  
  new_wikitext
}

# diff 
# this is a helper function that prints a simple added/removed 
# line diff for dry-run review

show_diff <- function(old, new) {
  old_lines <- str_split(old, "\n")[[1]]
  new_lines <- str_split(new, "\n")[[1]]
  removed   <- setdiff(old_lines, new_lines)
  added     <- setdiff(new_lines, old_lines)
  # I used AI to make this pretty for the console
  if (length(removed) > 0) {
    cat("\n── Removed lines ────────────────────────────────\n")
    cat(paste0("- ", removed), sep = "\n")
  }
  if (length(added) > 0) {
    cat("\n── Added lines ──────────────────────────────────\n")
    cat(paste0("+ ", added), sep = "\n")
  }
}

# bot

run_rfpp_bot <- function() {
  cat("══ RFPP Batch Bot ══════════════════════════════════════════════\n")
  cat(glue("Dry run: {BOT_CONFIG$dry_run}\n\n"))
  
  # auth
  login()
  csrf_token <- if (!BOT_CONFIG$dry_run)
    get_csrf_token()
  else
    NULL
  
  # fetch current batch page
  message("Fetching batch page wikitext...")
  wikitext <- fetch_wikitext(BOT_CONFIG$batch_page)
  
  # parse active articles
  active <- parse_active_articles(wikitext)
  message(glue("Found {nrow(active)} active article(s) in the queue\n"))
  
  if (nrow(active) == 0) {
    message("Queue is empty — nothing to do.")
    return(invisible(NULL))
  }
  
  walk(active$title, \(t) message("  • ", t))
  
  # check protection status and template
  message("\nChecking protection status")
  status <- check_all_articles(active$title)
  
  status |>
    mutate(
      status_label = case_when(
        ready        ~ "READY (protected + template)",
        is_protected ~ "API protected, no template yet",
        TRUE         ~ "not yet protected"
      )
    ) |>
    select(title, prot_level, has_template, status_label) |>
    print(n = Inf)
  
  ready_titles <- status |>
    filter(ready) |>
    pull(title)
  
  if (length(ready_titles) == 0) {
    message("\nNo articles are ready to move yet.")
    return(invisible(status))
  }
  
  message(glue(
    "\n{length(ready_titles)} article(s) ready to move to Handled."
  ))
  
  # join protection status 
  ready_df <- active |>
    inner_join(filter(status, ready), by = "title") |>
    select(title, raw_line, subsection)
  
  # rewrite the wikitext
  new_wikitext <- move_to_handled(wikitext, ready_df)
  
  # dry_run = TRUE will show diff and return
  # dry_run = FALSE will submit edit to the page
  if (BOT_CONFIG$dry_run) {
    message("\n── DRY RUN diff ──────────────────────────────────────────────")
    show_diff(wikitext, new_wikitext)
    message("\nSet BOT_CONFIG$dry_run <- FALSE to submit the edit.")
    return(invisible(list(
      old = wikitext,
      new = new_wikitext,
      status = status
    )))
  }
  
  message("\nSubmitting edit")
  edit_resp <- request(BOT_CONFIG$api_url) |>
    req_cookie_preserve(BOT_CONFIG$cookie_jar) |>
    req_user_agent(BOT_CONFIG$user_agent) |>
    req_method("POST") |>
    req_body_form(
      action  = "edit",
      format  = "json",
      title   = BOT_CONFIG$batch_page,
      text    = new_wikitext,
      summary = glue(
        "Moving {length(ready_titles)} article(s) from active to handled"
      ),
      bot     = "1",
      minor   = "1",
      token   = csrf_token
    ) |>
    req_perform() |>
    resp_body_json()
  
  if (identical(edit_resp$edit$result, "Success")) {
    message(glue("✓ Edit saved — revid {edit_resp$edit$newrevid}"))
  } else {
    warning(
      "Edit response unexpected:\n",
      jsonlite::toJSON(edit_resp, auto_unbox = TRUE, pretty = TRUE)
    )
  }
  
  invisible(edit_resp)
}

# run
run_rfpp_bot()