Process Guardian - What we know on Ukraine

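This post uses the Guardian Content API to download the "What we know" series on the war in Ukraine, and LexRank (via the lexRankr package) to extract the top three points from each article.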
api
NLP
Author

Luke Heley

Published

May 18, 2023

Code
# Download every article in the Guardian "Russia-Ukraine war at a glance"
# series from the Content API and stack them into one tibble, `pages`.
api_key <- Sys.getenv("GUARDIAN_API")
if (!nzchar(api_key)) {
  stop("Environment variable GUARDIAN_API is not set.", call. = FALSE)
}

base <- "https://content.guardianapis.com/"
path <- "search"
query <- list(
  tag = "world/series/russia-ukraine-war-at-a-glance",
  `api-key` = api_key,
  `page-size` = 50,
  `show-fields` = "trailText,headline,body"
)

# Turn the list of articles in one API response into a tibble with one row
# per article; nested fields are flattened to names such as `fields.body`.
flatten_results <- function(results) {
  results |>
    purrr::map(\(article) dplyr::as_tibble(t(unlist(article)))) |>
    dplyr::bind_rows()
}

# The first request carries no `page` parameter; its response also reports
# how many pages there are in total.
req <- httr::GET(base, path = path, query = query)
# Stop on HTTP errors (bad key, rate limit) instead of parsing an error body.
httr::stop_for_status(req)
back <- httr::content(req)

page1 <- flatten_results(back$response$results)

n_pages <- back$response$pages
# seq_len(n)[-1] is empty when there is at most one page, whereas 2:n would
# count down (2, 1) and request pages that are duplicated or do not exist.
other_pages <- purrr::map(seq_len(n_pages)[-1], \(page) {
  page_req <- httr::GET(
    base,
    path = path,
    query = c(query, list(page = page))
  )
  httr::stop_for_status(page_req)
  flatten_results(httr::content(page_req)$response$results)
})

pages <- dplyr::bind_rows(page1, dplyr::bind_rows(other_pages))
#write.csv()
Code
# Load the article data cached from an earlier API run instead of calling the
# API again on every render.
pages <- "2023-05-18-guardian-ukraine-what-we-know.csv" |>
  readr::read_csv()
Rows: 440 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (12): id, type, sectionId, sectionName, webTitle, webUrl, apiUrl, field...
lgl   (1): isHosted
dttm  (1): webPublicationDate

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
# Extract the paragraph text from one article's HTML body.
#
# x: a single string of HTML.
# Returns a character vector holding the text of every <p> element, split on
# newlines, trimmed, with empty strings removed. A missing (NA) or empty body
# yields character(0) rather than an error from the HTML parser.
process_text <- function(x) {
  if (length(x) == 1L && (is.na(x) || !nzchar(x))) {
    return(character(0))
  }
  # read_html is exported by xml2, so `::` is sufficient (no need for `:::`).
  xml2::read_html(x) |>
    xml2::xml_find_all("//p") |>
    xml2::xml_text() |>
    stringr::str_split("\n") |>
    unlist() |>
    stringr::str_trim() |>
    stringi::stri_remove_empty()
}

# One character vector of paragraph text per article, in the row order of
# `pages`.
body <- pages$fields.body |>
  purrr::map(process_text)
Code
# Run LexRank over each article's paragraphs, asking for the 3 top-ranked
# sentences per article.
top3 <- purrr::map(body, \(paragraphs) {
  # Every element belongs to the same article, so they all share one docId.
  single_doc <- rep(1, length(paragraphs))
  lexRankr::lexRank(
    paragraphs,
    docId = single_doc,
    n = 3,
    continuous = TRUE,
    Verbose = FALSE
  )
})

# Keep only the sentence column of each LexRank result.
top3_sent <- purrr::map(top3, \(ranked) ranked$sentence)
Code
# Summarise: one row per article showing its publication date, the day number
# quoted in the headline, and the trail text. Clicking a row expands a nested
# table holding that article's LexRank sentences.
pages |>
  dplyr::mutate(
    fields.headline = stringr::str_trim(fields.headline),
    # Take the digits that follow "day " in the headline. `[0-9]+` instead of
    # `{1,3}` so a four-digit day such as "day 1000" is not cut to 100.
    day = as.numeric(stringr::str_extract(fields.headline, "(?<=day )[0-9]+"))
  ) |>
  dplyr::select(webPublicationDate, day, fields.trailText) |>
  reactable::reactable(
    details = function(index) {
      # `index` is the row number, which lines up with `top3_sent` because
      # both follow the row order of `pages`. Naming the column gives the
      # nested table a readable header instead of an auto-deparsed one.
      top3_i <- data.frame(
        sentence = top3_sent[[index]],
        stringsAsFactors = FALSE
      )
      tbl <- reactable::reactable(
        top3_i,
        outlined = TRUE,
        highlight = TRUE,
        fullWidth = TRUE
      )
      htmltools::div(style = list(margin = "12px 45px"), tbl)
    },
    onClick = "expand",
    rowStyle = list(cursor = "pointer")
  )
Code
#write.csv(pages, paste0(Sys.Date(), "-guardian-ukraine-what-we-know.csv"), row.names = FALSE)