Using Wiki Data to Extract Data

This post shows how to access the Wikidata knowledge graph, the structured data behind Wikipedia.
wikipedia
scrape
Author

Luke Heley

Published

March 26, 2023

Objective

Extract the knowledge graph for a Chinese strategic submarine.

Data

We use the Wikidata API to extract the knowledge graph for the item.

The approach is to search Wikidata for the item of interest and select the best match from the list of search results that are returned.

#' Search Wikidata and return the search results as a tibble.
#'
#' Calls the `query` / `search` action of the API at www.wikidata.org.
#'
#' @param search Search string, sent to the API as `srsearch`.
#' @return A tibble with one row per search result and one character column
#'   per field of the result (all values are converted with
#'   `as.character()`).
#' Stops with an error when the HTTP status of the response is not 200.
search_wikidata <- function(search) {
  base <- "https://www.wikidata.org"
  path <- "/w/api.php"
  query <- list(
    action = "query",
    list = "search",
    format = "json",
    srsearch = search
  )

  req <- httr::GET(base, path = path, query = query)

  # Fail loudly on an HTTP error rather than trying to parse an error page
  # as if it were a list of search results.
  status <- httr::status_code(req)
  if (status != 200) {
    stop("Error returned status: ", status, call. = FALSE)
  }

  httr::content(req) |>
    purrr::pluck("query", "search") |>
    purrr::map_df(function(hit) {
      # Flatten one search result into name/value pairs, then spread the
      # pairs into a single-row tibble.
      fields <- unlist(hit)
      dplyr::tibble(name = names(fields), value = as.character(fields)) |>
        tidyr::pivot_wider(names_from = name, values_from = value)
    })
}

# Search for the submarine class; the outer parentheses print the result.
(search_results <- search_wikidata("Type-094 submarine"))
# A tibble: 2 × 6
  ns    title    pageid  wordcount snippet                               times…¹
  <chr> <chr>    <chr>   <chr>     <chr>                                 <chr>  
1 0     Q1203377 1146175 0         nuclear-powered ballistic missile su… 2023-0…
2 0     Q7008427 6886594 0         Wikimedia category                    2022-1…
# … with abbreviated variable name ¹​timestamp
# Extract the item title of the first search result, which is the item of
# interest here. In the results printed above, the title column holds the
# Wikidata item id (Q1203377).
root_item_title <- search_results$title[1]

We then extract the Wikidata claims associated with the item.

#' Extract the claims of a chosen Wikidata item as a tibble.
#'
#' Downloads `wiki/Special:EntityData/{item}.json` from www.wikidata.org and
#' flattens the `claims` element of the item, one property at a time.
#'
#' @param item Wikidata item id, e.g. `"Q1203377"`.
#' @return A tibble with one column per flattened claim field (for example
#'   `mainsnak.property`). A property whose claims repeat the same field
#'   names contributes several rows.
#' Stops with an error when the HTTP status of the response is not 200.
get_entity_data <- function(item = "Q1203377") {
  base <- "https://www.wikidata.org/"
  path <- glue::glue("wiki/Special:EntityData/{item}.json")
  query <- list(flavor = "simple")
  req <- httr::GET(base, path = path, query = query)

  # Fail loudly on an HTTP error rather than trying to parse an error page.
  status <- httr::status_code(req)
  if (status != 200) {
    stop("Error returned status: ", status, call. = FALSE)
  }

  httr::content(req) |>
    purrr::pluck("entities", item, "claims") |>
    purrr::map_df(function(claims) {
      value <- unlist(claims)
      dplyr::tibble(name = names(value), value = value) |>
        # A property with several statements repeats the same field names.
        # Always collect values into list-columns so pivot_wider() does not
        # warn about non-unique values, then expand them again explicitly.
        tidyr::pivot_wider(
          names_from = name,
          values_from = value,
          values_fn = list
        ) |>
        # `cols` is required by current tidyr; unnest every column.
        tidyr::unnest(cols = dplyr::everything())
    })
}

# Fetch the claims of the item selected from the search results.
entity_data <- get_entity_data(root_item_title)
Warning: `cols` is now required when using unnest().
Please use `cols = c()`
`cols` is now required when using unnest().
Please use `cols = c()`
`cols` is now required when using unnest().
Please use `cols = c()`
`cols` is now required when using unnest().
Please use `cols = c()`
Warning: Values from `value` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = {summary_fun}` to summarise duplicates.
* Use the following dplyr code to identify duplicates.
  {data} %>%
    dplyr::group_by(name) %>%
    dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
    dplyr::filter(n > 1L)
Warning: `cols` is now required when using unnest().
Please use `cols = c(mainsnak.snaktype, mainsnak.property, mainsnak.hash, `mainsnak.datavalue.value.entity-type`, 
    `mainsnak.datavalue.value.numeric-id`, mainsnak.datavalue.value.id, 
    mainsnak.datavalue.type, mainsnak.datatype, type, id, rank)`
Warning: `cols` is now required when using unnest().
Please use `cols = c()`
`cols` is now required when using unnest().
Please use `cols = c()`
`cols` is now required when using unnest().
Please use `cols = c()`
`cols` is now required when using unnest().
Please use `cols = c()`
`cols` is now required when using unnest().
Please use `cols = c()`
`cols` is now required when using unnest().
Please use `cols = c()`
`cols` is now required when using unnest().
Please use `cols = c()`
Warning: Values from `value` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = {summary_fun}` to summarise duplicates.
* Use the following dplyr code to identify duplicates.
  {data} %>%
    dplyr::group_by(name) %>%
    dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
    dplyr::filter(n > 1L)
Warning: `cols` is now required when using unnest().
Please use `cols = c(mainsnak.snaktype, mainsnak.property, mainsnak.hash, `mainsnak.datavalue.value.entity-type`, 
    `mainsnak.datavalue.value.numeric-id`, mainsnak.datavalue.value.id, 
    mainsnak.datavalue.type, mainsnak.datatype, type, id, rank, 
    references.hash, references.snaks.P143.snaktype, references.snaks.P143.property, 
    references.snaks.P143.hash, `references.snaks.P143.datavalue.value.entity-type`, 
    `references.snaks.P143.datavalue.value.numeric-id`, references.snaks.P143.datavalue.value.id, 
    references.snaks.P143.datavalue.type, references.snaks.P143.datatype, 
    `references.snaks-order`)`
Warning: `cols` is now required when using unnest().
Please use `cols = c()`
`cols` is now required when using unnest().
Please use `cols = c()`
`cols` is now required when using unnest().
Please use `cols = c()`

Next, we look up the labels of the associated properties.

# Distinct property ids referenced by the item's claims.
properties <- unique(dplyr::pull(entity_data, "mainsnak.property"))

#' Look up the English labels of one or more Wikidata entities.
#'
#' Calls the `wbgetentities` action of the API at www.wikidata.org with
#' `props = "labels"` and `languages = "en"`.
#'
#' @param id Character vector of entity ids (e.g. `"P373"`). Several ids are
#'   sent in a single request, separated by `|`.
#' @return A tibble with one row per returned entity and one column per
#'   flattened field (for example `id` and `labels.en.value`).
#' Stops with an error when the HTTP status of the response is not 200.
get_entity_id <- function(id = "P373") {
  if (length(id) > 1) id <- paste(id, collapse = "|")
  base <- "https://www.wikidata.org/"
  path <- "w/api.php"
  query <- list(
    action = "wbgetentities",
    ids = id,
    languages = "en",
    props = "labels",
    format = "json"
  )

  req <- httr::GET(base, path = path, query = query)

  # Use the accessor instead of `req$status`, which only worked through
  # partial matching of the `status_code` element.
  status <- httr::status_code(req)
  if (status != 200) {
    stop("Error returned status: ", status, call. = FALSE)
  }

  httr::content(req) |>
    purrr::pluck("entities") |>
    purrr::map_df(function(entity) {
      value <- unlist(entity)
      dplyr::tibble(name = names(value), value = value) |>
        tidyr::pivot_wider(names_from = name, values_from = value)
    })
}

# Look up the English labels of all properties in one call and print them.
(prop_label <- get_entity_id(properties))
# A tibble: 16 × 5
   type     datatype        id    labels.en.language labels.en.value      
   <chr>    <chr>           <chr> <chr>              <chr>                
 1 property string          P373  en                 Commons category     
 2 property wikibase-item   P516  en                 powered by           
 3 property string          P561  en                 NATO reporting name  
 4 property wikibase-item   P31   en                 instance of          
 5 property wikibase-item   P279  en                 subclass of          
 6 property wikibase-item   P910  en                 topic's main category
 7 property external-id     P646  en                 Freebase ID          
 8 property time            P729  en                 service entry        
 9 property wikibase-item   P156  en                 followed by          
10 property wikibase-item   P155  en                 follows              
11 property commonsMedia    P18   en                 image                
12 property wikibase-item   P176  en                 manufacturer         
13 property wikibase-item   P137  en                 operator             
14 property monolingualtext P1813 en                 short name           
15 property wikibase-item   P520  en                 armament             
16 property wikibase-item   P495  en                 country of origin    

Get the item data

# Reduce each claim to (property, type, value). The value is the first
# non-missing of: item id, plain value, time, text.
entity_data2 <- entity_data |>
  dplyr::transmute(
    property = mainsnak.property,
    type = mainsnak.datavalue.type,
    value = dplyr::coalesce(
      mainsnak.datavalue.value.id,
      mainsnak.datavalue.value,
      mainsnak.datavalue.value.time,
      mainsnak.datavalue.value.text
    )
  ) |>
  dplyr::distinct()

# Attach the English label of each property. The join key is spelled out so
# the join does not depend on which column names the two tables happen to
# share.
entity_data3 <- entity_data2 |>
  dplyr::left_join(
    prop_label |>
      dplyr::select(
        property = id,
        property_label = labels.en.value
      ),
    by = "property"
  )
Joining, by = "property"
# Distinct ids of the claim values that are themselves Wikidata entities.
items <- unique(
  entity_data3 |>
    dplyr::filter(type == "wikibase-entityid") |>
    dplyr::pull(value)
)

# Look up the English labels of those entities.
item_labels <- get_entity_id(items)

# Replace entity ids by their English labels and print the result. Values
# that have no label (strings, times, ...) are kept unchanged. The join key
# is spelled out so the join does not depend on which column names the two
# tables happen to share.
(item_property <- entity_data3 |>
  dplyr::left_join(
    item_labels |>
      dplyr::select(value = id, item_label = labels.en.value),
    by = "value"
  ) |>
  dplyr::mutate(item_label = dplyr::coalesce(item_label, value)) |>
  dplyr::select(property_label, item_label))
Joining, by = "value"
# A tibble: 17 × 2
   property_label        item_label                                          
   <chr>                 <chr>                                               
 1 Commons category      Type 09IV submarines                                
 2 powered by            nuclear marine propulsion                           
 3 NATO reporting name   Jin                                                 
 4 instance of           submarine class                                     
 5 subclass of           ballistic missile submarine                         
 6 subclass of           nuclear submarine                                   
 7 topic's main category Category:Type 094 submarines                        
 8 Freebase ID           /m/09wz47                                           
 9 service entry         +2010-01-01T00:00:00Z                               
10 followed by           Type 096 submarine                                  
11 follows               Type 092 Daqingyu                                   
12 image                 Jin (Type 094) Class Ballistic Missile Submarine.JPG
13 manufacturer          Bohai Shipyard                                      
14 operator              People's Liberation Army Navy                       
15 short name            Type 094                                            
16 armament              JL-2                                                
17 country of origin     People's Republic of China                          

Get wiki urls

#' Extract the sitelinks of a Wikidata item as a tibble.
#'
#' Downloads `wiki/Special:EntityData/{item}.json` from www.wikidata.org and
#' flattens the `sitelinks` element of the item.
#'
#' @param item Wikidata item id, e.g. `"Q1203377"`.
#' @return A tibble with one row per sitelink and one column per flattened
#'   field of the sitelink.
#' Stops with an error when the HTTP status of the response is not 200.
get_wikisites <- function(item = "Q1203377") {
  base <- "https://www.wikidata.org/"
  path <- glue::glue("wiki/Special:EntityData/{item}.json")
  query <- list(flavor = "simple")
  req <- httr::GET(base, path = path, query = query)

  # Fail loudly on an HTTP error rather than trying to parse an error page.
  status <- httr::status_code(req)
  if (status != 200) {
    stop("Error returned status: ", status, call. = FALSE)
  }

  httr::content(req) |>
    purrr::pluck("entities", item, "sitelinks") |>
    purrr::map_df(function(sitelink) {
      value <- unlist(sitelink)
      dplyr::tibble(name = names(value), value = value) |>
        tidyr::pivot_wider(names_from = name, values_from = value)
    })
}

# Keep only the "enwiki" sitelink and print its URL.
(wikiurl <- get_wikisites(root_item_title) |>
  dplyr::filter(site %in% "enwiki") |>
  dplyr::pull("url"))
                                                   
"https://en.wikipedia.org/wiki/Type_094_submarine" 

Scrape the infobox from the wiki url

#' Scrape the infobox tables from a web page.
#'
#' @param url URL of the page to scrape.
#' @return A list with one tibble per `<table>` element whose `class`
#'   attribute contains the token `infobox`.
#' Stops with an error when the HTTP status of the response is not 200.
scrape_infobox <- function(
    url = "https://en.wikipedia.org/wiki/Type_094_submarine"
) {
  req <- httr::GET(url)

  # Fail loudly on an HTTP error rather than scraping an error page.
  status <- httr::status_code(req)
  if (status != 200) {
    stop("Error returned status: ", status, call. = FALSE)
  }

  # Match `infobox` as one token of the class attribute, so tables with
  # additional classes (e.g. class="infobox vcard") are found as well; an
  # exact comparison with @class would miss them.
  xpath <- paste0(
    "//table[contains(concat(' ', normalize-space(@class), ' '), ",
    "' infobox ')]"
  )

  httr::content(req) |>
    xml2::xml_find_all(xpath) |>
    rvest::html_table()
}

# Scrape and print the infobox of the English Wikipedia article.
scrape_infobox("https://en.wikipedia.org/wiki/Type_094_submarine")
[[1]]
# A tibble: 22 × 2
   X1                      X2                               
   <chr>                   <chr>                            
 1 Profile of the Type 094 Profile of the Type 094          
 2 Type 094 submarine      Type 094 submarine               
 3 Class overview          Class overview                   
 4 Name                    Type 094 (Jin class)             
 5 Builders                Bohai Shipyard, Huludao, China[2]
 6 Operators               People's Liberation Army Navy    
 7 Preceded by             Type 092 (Xia class)             
 8 Succeeded by            Type 096                         
 9 Cost                    $750 million per unit[1]         
10 In commission           2007–present[2]                  
# … with 12 more rows

Scrape wikitables

#' Scrape the "wikitable" tables from a web page.
#'
#' @param url URL of the page to scrape.
#' @return A list with one tibble per `<table>` element whose `class`
#'   attribute contains the token `wikitable`.
#' Stops with an error when the HTTP status of the response is not 200.
scrape_wikitables <- function(
    url = "https://en.wikipedia.org/wiki/Type_094_submarine"
) {
  req <- httr::GET(url)

  # Fail loudly on an HTTP error rather than scraping an error page.
  status <- httr::status_code(req)
  if (status != 200) {
    stop("Error returned status: ", status, call. = FALSE)
  }

  # Match `wikitable` as one token of the class attribute, so tables with
  # additional classes (e.g. class="wikitable sortable") are found as well;
  # an exact comparison with @class would miss them.
  xpath <- paste0(
    "//table[contains(concat(' ', normalize-space(@class), ' '), ",
    "' wikitable ')]"
  )

  httr::content(req) |>
    xml2::xml_find_all(xpath) |>
    rvest::html_table()
}

# Scrape and print the wikitable tables of the English Wikipedia article.
scrape_wikitables("https://en.wikipedia.org/wiki/Type_094_submarine")
[[1]]
# A tibble: 8 × 7
  Name                `Hull no.`  Builder         Laid …¹ Launc…² Commi…³ Status
  <chr>               <chr>       <chr>           <chr>   <chr>   <chr>   <chr> 
1 "Type 094"          "Type 094"  "Type 094"      "Type … "Type … Type 0… Type …
2 ""                  "411[2]"    "Bohai Shipyar… "2001[… "28 Ju… March … Active
3 "Changzheng 10[20]" "412[2]"    "Bohai Shipyar… "2003[… "2006[… 2010[2] Active
4 "Changzheng 11[20]" "413[2]"    "Bohai Shipyar… "2004[… "Decem… 2012[2] Active
5 "Changzheng 18[21]" "421[22]"   ""              ""      ""      23 Apr… Active
6 "Type 094A"         "Type 094A" "Type 094A"     "Type … "Type … Type 0… Type …
7 ""                  ""          ""              ""      ""      2020[5] Active
8 ""                  ""          ""              ""      ""      2020[5] Active
# … with abbreviated variable names ¹​`Laid down`, ²​Launched, ³​Commissioned