Natural Language Processing

Extract people, organisations and locations from the Guardian's "what we know" reporting on Ukraine.
NLP
Author

Luke Heley

Published

May 18, 2023

Code
# Load the article table from the CSV export. The `fields.body` column is
# parsed as HTML further down. An empty `cols()` specification leaves every
# column type to readr's guessing.
df <- readr::read_csv(
  file = "2023-05-18-guardian-ukraine-what-we-know.csv",
  col_types = readr::cols()
)
Code
# Person names mentioned in each article: a list with one element per entry
# of `df$fields.body`, each holding the text spans tagged as a person.
people <- local({
  # None of the annotators depends on the article being processed, so build
  # them once here instead of once per article inside the mapped function.
  # `local()` keeps these helpers out of the global environment.
  sent_token_annotator <- openNLP::Maxent_Sent_Token_Annotator()
  word_token_annotator <- openNLP::Maxent_Word_Token_Annotator()
  entity_annotator <- openNLP::Maxent_Entity_Annotator(kind = "person")

  purrr::map(df$fields.body, \(body) {
    # Keep only the text of the <p> elements of the HTML body.
    text <- body |>
      xml2::read_html() |>
      xml2::xml_find_all("//p") |>
      xml2::xml_text()

    s <- NLP::String(paste(text, collapse = "\n"))

    # Entity recognition needs sentence and word token annotations first.
    a2 <- NLP::annotate(s, list(sent_token_annotator, word_token_annotator))

    # Subsetting the string by the entity annotations extracts the matched
    # text.
    s[entity_annotator(s, a2)]
  }, .progress = TRUE)
})
Code
# Organisation names mentioned in each article: a list with one element per
# entry of `df$fields.body`, each holding the text spans tagged as an
# organisation.
org <- local({
  # None of the annotators depends on the article being processed, so build
  # them once here instead of once per article inside the mapped function.
  # `local()` keeps these helpers out of the global environment.
  sent_token_annotator <- openNLP::Maxent_Sent_Token_Annotator()
  word_token_annotator <- openNLP::Maxent_Word_Token_Annotator()
  entity_annotator <- openNLP::Maxent_Entity_Annotator(kind = "organization")

  purrr::map(df$fields.body, \(body) {
    # Keep only the text of the <p> elements of the HTML body.
    text <- body |>
      xml2::read_html() |>
      xml2::xml_find_all("//p") |>
      xml2::xml_text()

    s <- NLP::String(paste(text, collapse = "\n"))

    # Entity recognition needs sentence and word token annotations first.
    a2 <- NLP::annotate(s, list(sent_token_annotator, word_token_annotator))

    # Entity recognition for organisations: subsetting the string by the
    # entity annotations extracts the matched text.
    s[entity_annotator(s, a2)]
  }, .progress = TRUE)
})
Code
# Location names mentioned in each article: a list with one element per entry
# of `df$fields.body`, each holding the text spans tagged as a location.
loc <- local({
  # None of the annotators depends on the article being processed, so build
  # them once here instead of once per article inside the mapped function.
  # `local()` keeps these helpers out of the global environment.
  sent_token_annotator <- openNLP::Maxent_Sent_Token_Annotator()
  word_token_annotator <- openNLP::Maxent_Word_Token_Annotator()
  entity_annotator <- openNLP::Maxent_Entity_Annotator(kind = "location")

  purrr::map(df$fields.body, \(body) {
    # Keep only the text of the <p> elements of the HTML body.
    text <- body |>
      xml2::read_html() |>
      xml2::xml_find_all("//p") |>
      xml2::xml_text()

    s <- NLP::String(paste(text, collapse = "\n"))

    # Entity recognition needs sentence and word token annotations first.
    a2 <- NLP::annotate(s, list(sent_token_annotator, word_token_annotator))

    # Entity recognition for locations: subsetting the string by the entity
    # annotations extracts the matched text.
    s[entity_annotator(s, a2)]
  }, .progress = TRUE)
})