# RFPP Batch Bot
# Moves protected articles from "Active requests" to "Handled requests"
# AI/LLM use disclosure [[WP:LLMDISCLOSE]]
# I used Claude by Anthropic to help with the regex,
# making the console output look pretty, and some of
# the comments explaining what's going on. -DV 2024-04-20
library(httr2)
library(tidyverse)
library(glue)
# config and set up
BOT_CONFIG <- list(
api_url = "https://en.wikipedia.org/w/api.php",
username = Sys.getenv("WP_BOT_USERNAME"),
password = Sys.getenv("WP_BOT_PASSWORD"),
batch_page = "Wikipedia:Requests for page protection/Batch",
user_agent = "VulpesBot/1.0 Task 5 RFPP Batch Mover (https://en.wikipedia.org/wiki/User:VulpesBot)",
cookie_jar = file.path(tempdir(), "wp_cookies.txt"),
# I added a dry run because I'm really bad with working with strings
# TRUE one does parse + report only, no edit submitted
# FALSE parse + edit
dry_run = FALSE
)
stopifnot(
"WP_BOT_USERNAME not set in .Renviron" = nchar(BOT_CONFIG$username) > 0,
"WP_BOT_PASSWORD not set in .Renviron" = nchar(BOT_CONFIG$password) > 0
)
# request
wp_req <- function(...) {
request(BOT_CONFIG$api_url) |>
req_url_query(format = "json", formatversion = "2", ...) |>
req_cookie_preserve(BOT_CONFIG$cookie_jar) |>
req_user_agent(BOT_CONFIG$user_agent) |>
req_retry(max_tries = 3, backoff = ~ 5)
}
#auth
login <- function() {
# Clear any stale session
if (file.exists(BOT_CONFIG$cookie_jar))
file.remove(BOT_CONFIG$cookie_jar)
# get a new login token, not a cached token
login_token <- wp_req(action = "query",
meta = "tokens",
type = "login") |>
req_perform() |>
resp_body_json() |>
pluck("query", "tokens", "logintoken")
# POST credentials
result <- wp_req(action = "login") |>
req_method("POST") |>
req_body_form(
lgname = BOT_CONFIG$username,
lgpassword = BOT_CONFIG$password,
lgtoken = login_token
) |>
req_perform() |>
resp_body_json()
# this is so it looks pretty in the console
if (result$login$result != "Success") {
stop(
"Login failed: ",
result$login$result,
"\nReason: ",
result$login$reason %||% "(none)"
)
}
message("Authenticated as: ", result$login$lgusername)
invisible(result)
}
get_csrf_token <- function() {
wp_req(action = "query", meta = "tokens") |>
req_perform() |>
resp_body_json() |>
pluck("query", "tokens", "csrftoken")
}
# fetch the page
fetch_wikitext <- function(title) {
resp <- wp_req(
action = "query",
titles = title,
prop = "revisions",
rvprop = "content",
rvslots = "main"
) |>
req_perform() |>
resp_body_json()
content <- resp |>
pluck("query", "pages", 1, "revisions", 1, "slots", "main", "content")
if (is.null(content))
stop("Could not retrieve wikitext for: ", title)
content
}
# regex helper
regex_escape <- function(x) {
str_replace_all(x, r"([\\^$.|?*+()\[\]{}])", r"(\\\0)")
}
# parse the active articles in [[WP:BATCH]]
# returns a tibble with columns: title, raw_line, subsection
# Walks every line under === Active requests ===, tracking ==== subsection ====
# headers so each article knows which handled subsection it belongs to.
parse_active_articles <- function(wikitext) {
# Capture the entire Active requests section (up to Handled requests)
active_block <- str_match(wikitext,
r"((?s)=== Active requests ===\n(.*?)=== Handled requests ===)")[, 2]
if (is.na(active_block)) {
warning("Could not locate the === Active requests === section")
return(tibble(
title = character(),
raw_line = character(),
subsection = character()
))
}
lines <- str_split(active_block, "\n")[[1]]
# walking down the lines of [[WP:BATCH]]
current_subsection <- NA_character_
rows <- list()
for (line in lines) {
# goes by headers down the wikitext
if (str_detect(line, r"(^====\s*.+\s*====\s*$)")) {
current_subsection <- str_trim(str_remove_all(line, "="))
next
}
# only use a link if it contains a {{pagelinks|...}} template
if (!str_detect(line, r"(\{\{pagelinks\|)"))
next
title <- str_match(line, r"(\{\{pagelinks\|(.+?)\}\})")[, 2]
# skip missing titles and the template placeholder
if (is.na(title) || title == "PAGE TITLE HERE")
next
rows[[length(rows) + 1]] <- tibble(raw_line = line,
title = title,
subsection = current_subsection)
}
if (length(rows) == 0) {
return(tibble(
title = character(),
raw_line = character(),
subsection = character()
))
}
list_rbind(rows)
}
# check if article is really protected
query_protection_api <- function(titles) {
resp <- wp_req(
action = "query",
titles = paste(titles, collapse = "|"),
prop = "info",
inprop = "protection"
) |>
req_perform() |>
resp_body_json()
resp |>
pluck("query", "pages") |>
map(\(page) {
protections <- page$protection %||% list()
edit_prot <- keep(protections, \(p) p$type == "edit")
tibble(
title = page$title,
is_protected = length(edit_prot) > 0,
prot_level = if (length(edit_prot) > 0)
edit_prot[[1]]$level
else
NA_character_
)
}) |>
list_rbind()
}
# check to see if there is a protection template present in the article wikitext.
has_protection_template <- function(title) {
text <- tryCatch(
fetch_wikitext(title),
error = \(e) NULL
)
if (is.null(text))
return(FALSE)
str_detect(text, r"(\{\{[Pp][Pp][-–])")
}
# run both checks
check_all_articles <- function(titles) {
# Batch the API calls in chunks of 50
api_results <- titles |>
split(ceiling(seq_along(titles) / 50)) |>
map(query_protection_api) |>
list_rbind()
# only bother getting wikitext for articles that are protected
api_results |>
mutate(
has_template = map2_lgl(title, is_protected, \(t, prot) {
if (prot)
has_protection_template(t)
else
FALSE
}),
ready = is_protected & has_template
)
}
# wikitext formater
# articles from each ==== subsection ==== in Active requests are inserted into
# the matching ==== subsection ==== in Handled requests. For now there is only
# one but there may be others added in the future.
move_to_handled <- function(wikitext, ready) {
if (nrow(ready) == 0)
return(wikitext)
new_wikitext <- wikitext
# Remove moved article lines from Active requests
all_titles <- ready$title
esc_titles <- regex_escape(all_titles)
removal_pattern <- paste0(
r"(:?\*\s*\{\{pagelinks\|(?:)",
paste(esc_titles, collapse = "|"),
r"()\}\}[^\n]*\n?)"
)
new_wikitext <- str_remove_all(new_wikitext, removal_pattern)
# clean up submitter headers
new_wikitext <- str_remove_all(new_wikitext,
r"((?m)^\* From \[\[.*?\]\].*?\n(?=\* From |\n*<!-- =))")
# insert entries into the matching Handled subsection
# split by both sections so R doesn't process things wrong
split_point <- "=== Handled requests ==="
split_pos <- str_locate(new_wikitext, fixed(split_point))[, "start"]
active_half <- str_sub(new_wikitext, 1, split_pos - 1)
handled_half <- str_sub(new_wikitext, split_pos)
ready |>
group_by(subsection) |>
group_walk(\(df, grp) {
sec <- grp$subsection
new_entries <- df$raw_line |>
map_chr(\(ln) paste0(ln, "")) |>
paste(collapse = "\n")
# look for this header pattern
header_pattern <- paste0(r"((==== )", esc_sec, r"( ====\n))")
handled_half <<- str_replace(handled_half,
header_pattern,
paste0("\\1", new_entries, "\n"))
})
new_wikitext <- paste0(active_half, handled_half)
new_wikitext
}
# diff
# this is a helper function that prints a simple added/removed
# line diff for dry-run review
show_diff <- function(old, new) {
old_lines <- str_split(old, "\n")[[1]]
new_lines <- str_split(new, "\n")[[1]]
removed <- setdiff(old_lines, new_lines)
added <- setdiff(new_lines, old_lines)
# I used AI to make this pretty for the console
if (length(removed) > 0) {
cat("\n── Removed lines ────────────────────────────────\n")
cat(paste0("- ", removed), sep = "\n")
}
if (length(added) > 0) {
cat("\n── Added lines ──────────────────────────────────\n")
cat(paste0("+ ", added), sep = "\n")
}
}
# bot
run_rfpp_bot <- function() {
cat("══ RFPP Batch Bot ══════════════════════════════════════════════\n")
cat(glue("Dry run: {BOT_CONFIG$dry_run}\n\n"))
# auth
login()
csrf_token <- if (!BOT_CONFIG$dry_run)
get_csrf_token()
else
NULL
# fetch current batch page
message("Fetching batch page wikitext...")
wikitext <- fetch_wikitext(BOT_CONFIG$batch_page)
# parse active articles
active <- parse_active_articles(wikitext)
message(glue("Found {nrow(active)} active article(s) in the queue\n"))
if (nrow(active) == 0) {
message("Queue is empty — nothing to do.")
return(invisible(NULL))
}
walk(active$title, \(t) message(" • ", t))
# check protection status and template
message("\nChecking protection status")
status <- check_all_articles(active$title)
status |>
mutate(
status_label = case_when(
ready ~ "READY (protected + template)",
is_protected ~ "API protected, no template yet",
TRUE ~ "not yet protected"
)
) |>
select(title, prot_level, has_template, status_label) |>
print(n = Inf)
ready_titles <- status |>
filter(ready) |>
pull(title)
if (length(ready_titles) == 0) {
message("\nNo articles are ready to move yet.")
return(invisible(status))
}
message(glue(
"\n{length(ready_titles)} article(s) ready to move to Handled."
))
# join protection status
ready_df <- active |>
inner_join(filter(status, ready), by = "title") |>
select(title, raw_line, subsection)
# rewrite the wikitext
new_wikitext <- move_to_handled(wikitext, ready_df)
# dry_run = TRUE will show diff and return
# dry_run = FALSE will submit edit to the page
if (BOT_CONFIG$dry_run) {
message("\n── DRY RUN diff ──────────────────────────────────────────────")
show_diff(wikitext, new_wikitext)
message("\nSet BOT_CONFIG$dry_run <- FALSE to submit the edit.")
return(invisible(list(
old = wikitext,
new = new_wikitext,
status = status
)))
}
message("\nSubmitting edit")
edit_resp <- request(BOT_CONFIG$api_url) |>
req_cookie_preserve(BOT_CONFIG$cookie_jar) |>
req_user_agent(BOT_CONFIG$user_agent) |>
req_method("POST") |>
req_body_form(
action = "edit",
format = "json",
title = BOT_CONFIG$batch_page,
text = new_wikitext,
summary = glue(
"Moving {length(ready_titles)} article(s) from active to handled"
),
bot = "1",
minor = "1",
token = csrf_token
) |>
req_perform() |>
resp_body_json()
if (identical(edit_resp$edit$result, "Success")) {
message(glue("✓ Edit saved — revid {edit_resp$edit$newrevid}"))
} else {
warning(
"Edit response unexpected:\n",
jsonlite::toJSON(edit_resp, auto_unbox = TRUE, pretty = TRUE)
)
}
invisible(edit_resp)
}
# run
run_rfpp_bot()