Retrieve details about Olympics 2020 medalists via Wikipedia and Wikidata

Giorgio Comai https://giorgiocomai.eu (OBCT/CCI - EDJNet)https://europeandatajournalism.eu/
2022-02-02

Load packages and set caching folders

This scripts makes use of some of the most common packages for R for data wrangling, scraping, and geocomputation.

It also makes use of two additional packages created within the scope of EDJNet:

By default, all data will be cached inside the active folder (likely, your project folder if you have cloned this with the likest of Rstudio). If you intend to use tidywikidatar or latlon2map in other projects, you may want to set the cache in a shared folder, as suggested in the code commented out below.

knitr::opts_chunk$set(echo = TRUE)

library("dplyr") 
library("rvest")
library("stringr")
library("sf")
library("jsonlite")
library("purrr")

# remotes::install_github("EDJNet/tidywikidatar")
library("tidywikidatar")
tw_enable_cache()
# tw_set_cache_folder(path = fs::path(fs::path_home_r(), "R", "tw_data"))
tw_set_language(language = "en")
tw_create_cache_folder(ask = FALSE)


# remotes::install_github("giocomai/latlon2map")
library("latlon2map")
# ll_set_folder(path = fs::path(fs::path_home_r(), "R", "ll_data"))
options(timeout = 60000) # give the time to download the needed datasets

Getting data out of Wikipedia

We’ll be getting all medals from this Wikipedia page. If you cloned this repository, you’ll find a version of this page cached locally. If you want to update it, just delete the local all_medals_wiki.html and re-download the latest version.

In this table, each row is a competition, each cell corresponds to one or more medal winners (for team sports).

if (fs::file_exists("all_medals_wiki.html")==FALSE) {
  download.file(url = "https://en.wikipedia.org/wiki/List_of_2020_Summer_Olympics_medal_winners",
                destfile = "all_medals_wiki.html")
}

all_medals_page <- read_html(x = "all_medals_wiki.html")

all_medal_tables <- all_medals_page %>% 
  html_nodes(css = "table")

Here I create some functions that I will use to extract data from the table. It’s a bit of a pain to extract data consistently, including relevant links, and there are likely easier way to do this, but this seems to work alright.

I extract separately athletes, and the country they won their medal for, and then pull the data together.

o20_get_medalist_from_cell <- function(row,
                                       td_number) {
  links <- row %>% 
    html_nodes("td") %>% 
    .[[td_number]] %>% 
    html_nodes("a") %>% 
    html_attr("href") 
  
  if (length(links)==0) {
    list(links)
  } else {
    links[stringr::str_detect(string = links,
                              pattern = "_at_the_2020_Summer_Olympics",
                              negate = TRUE)] %>% 
      list()
  }
}

o20_get_country_from_cell <- function(row, td_number) {
  links <- row %>% 
    html_nodes("td") %>% 
    .[[td_number]] %>% 
    html_nodes("a") %>% 
    html_attr("href") 
  
  if (length(links)==0) {
    as.character(NA)
  } else {
    links[stringr::str_detect(string = links,
                              pattern = "_at_the_2020_Summer_Olympics",
                              negate = FALSE)] 
  }
}



o20_get_medals_from_table <- function(table) {
  
  #print(class(table))
  table_colnames <- table %>%
    html_table() %>% 
    colnames()
  
  if (!(length(table_colnames)==4|length(table_colnames)==7|length(table_colnames)==6)) {
    return(NULL)
  }
  
  col4 <- table %>%
    html_table() %>% 
    colnames() %>% 
    .[4]
  
  if (col4 != "Bronze" & col4 != "Silver") {
    return(NULL)
  }
  
  
  current_table <- table %>% 
    html_nodes("tr") %>% 
    .[-1] #remove header
  
  purrr::map_dfr(
    .x = current_table,
    .f = function(current_row) {
      # print(current_row %>%
      #         html_nodes("td") %>%
      #         .[[1]] %>%  html_text())
      
      col_number <- current_row %>% 
        html_nodes("td") %>% 
        length()
      
      
      if (col_number==4) {
        
        
        medal_row_df <- tibble::tibble(event_link = current_row %>% 
                                         html_nodes("td") %>% 
                                         .[[1]] %>% 
                                         html_nodes("a") %>% 
                                         html_attr("href"), 
                                       gold_medalist = current_row %>% 
                                         o20_get_medalist_from_cell(td_number = 2), 
                                       gold_country = current_row %>% 
                                         o20_get_country_from_cell(td_number = 2),
                                       silver_medalist = current_row %>% 
                                         o20_get_medalist_from_cell(td_number = 3), 
                                       silver_country = current_row %>% 
                                         o20_get_country_from_cell(td_number = 3),
                                       bronze_medalist = current_row %>% 
                                         o20_get_medalist_from_cell(td_number = 4), 
                                       bronze_country = current_row %>% 
                                         o20_get_country_from_cell(td_number = 4),
        ) 
      } else if (col_number==7) {
        
        
        medal_row_df <- tibble::tibble(event_link = current_row %>% 
                                         html_nodes("td") %>% 
                                         .[[1]] %>% 
                                         html_nodes("a") %>% 
                                         html_attr("href"), 
                                       gold_medalist = current_row %>% 
                                         o20_get_medalist_from_cell(td_number = 2), 
                                       gold_country = current_row %>% 
                                         o20_get_country_from_cell(td_number = 2),
                                       silver_medalist = current_row %>% 
                                         o20_get_medalist_from_cell(td_number = 4), 
                                       silver_country = current_row %>% 
                                         o20_get_country_from_cell(td_number = 4),
                                       bronze_medalist = current_row %>% 
                                         o20_get_medalist_from_cell(td_number = 6), 
                                       bronze_country = current_row %>% 
                                         o20_get_country_from_cell(td_number = 6)
        ) 
      } else if (col_number==6) {
        
        
        medal_row_df <- tibble::tibble(event_link = current_row %>% 
                                         html_nodes("td") %>% 
                                         .[[1]] %>% 
                                         html_nodes("a") %>% 
                                         html_attr("href"), 
                                       gold_medalist = current_row %>% 
                                         o20_get_medalist_from_cell(td_number = 2), 
                                       gold_country = current_row %>% 
                                         o20_get_country_from_cell(td_number = 2),
                                       silver_medalist = current_row %>% 
                                         o20_get_medalist_from_cell(td_number = 4), 
                                       silver_country = current_row %>% 
                                         o20_get_country_from_cell(td_number = 4),
                                       bronze_medalist = current_row %>% 
                                         o20_get_medalist_from_cell(td_number = 5), 
                                       bronze_country = current_row %>% 
                                         o20_get_country_from_cell(td_number = 5)
        ) 
      } else {
        return(NULL)
      }
      
      
      
      dplyr::bind_rows(medal_row_df %>% 
                         dplyr::transmute(event_link,
                                          medal = "gold",
                                          medalist_link = gold_medalist,
                                          delegation_link = gold_country),
                       medal_row_df %>% 
                         dplyr::transmute(event_link,
                                          medal = "silver",
                                          medalist_link = silver_medalist,
                                          delegation_link = silver_country),
                       medal_row_df %>% 
                         dplyr::transmute(event_link,
                                          medal = "bronze",
                                          medalist_link = bronze_medalist,
                                          delegation_link = bronze_country) 
                       
      ) %>% 
        tidyr::unnest(cols = "medalist_link") %>% 
        dplyr::mutate(medalist_link = as.character(medalist_link)) %>% 
        dplyr::filter(stringr::str_detect(string = medalist_link,
                                          pattern =  "#endnote",
                                          negate = TRUE))
      
    })
}

Once these functions are defined, then everything looks super-easy. Here’s how the data look like at this stage.

all_medalists_df_pre <- purrr::map_dfr(.x = all_medal_tables,
                                       .f = function(table) {
                                         o20_get_medals_from_table(table)
                                       })

all_medalists_df_pre$medalist_link[all_medalists_df_pre$medalist_link == "/wiki/Gianmarco_Tamberi"&all_medalists_df_pre$delegation_link=="/wiki/Qatar_at_the_2020_Summer_Olympics"] <- "/wiki/Mutaz_Essa_Barshim"

all_medalists_df_pre$medalist_link[all_medalists_df_pre$medalist_link == "/wiki/Mutaz_Essa_Barshim"&all_medalists_df_pre$delegation_link=="/wiki/Italy_at_the_2020_Summer_Olympics"] <- "/wiki/Gianmarco_Tamberi"


## add manually bronze medals shared by two athletes... less than ideal, but not in the mood to fix the parser now

## update: I didn't realise there were so many double-bronze... a tiny fix above would have been much quicker, but anyways...

double_medals <- tibble::tribble(~event_link, ~medal, ~medalist_link, ~delegation_link,
                                 
                                 "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_flyweight", "bronze", "/wiki/Saken_Bibossinov", "/wiki/Kazakhstan_at_the_2020_Summer_Olympics",                             "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_featherweight","bronze", "/wiki/L%C3%A1zaro_%C3%81lvarez", "/wiki/Cuba_at_the_2020_Summer_Olympics",
                                 "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_lightweight", "bronze", "/wiki/Harry_Garside", "/wiki/Australia_at_the_2020_Summer_Olympics",
                                 "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_welterweight", "bronze", "/wiki/Aidan_Walsh_(boxer)", "/wiki/Ireland_at_the_2020_Summer_Olympics",
                                 "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_middleweight", "bronze", "/wiki/Gleb_Bakshi", "/wiki/ROC_at_the_2020_Summer_Olympics", 
                                 "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_light_heavyweight", "bronze", "/wiki/Imam_Khataev", "/wiki/ROC_at_the_2020_Summer_Olympics",
                                 "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_heavyweight", "bronze", "/wiki/Abner_Teixeira", "/wiki/Brazil_at_the_2020_Summer_Olympics",
                                 
                                 "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_super_heavyweight", "bronze", "/wiki/Kamshybek_Kunkabayev", "/wiki/Kazakhstan_at_the_2020_Summer_Olympics",
                                 
                                 
                                 "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_flyweight", "bronze", "/wiki/Tsukimi_Namiki", "/wiki/Japan_at_the_2020_Summer_Olympics",
                                 "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_featherweight", "bronze", "/wiki/Karriss_Artingstall", "/wiki/Great_Britain_at_the_2020_Summer_Olympics",
                                 "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_featherweight", "bronze", "/wiki/Irma_Testa", "/wiki/Italy_at_the_2020_Summer_Olympics",
                                 "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_lightweight", "bronze", "/wiki/Mira_Potkonen", "/wiki/Finland_at_the_2020_Summer_Olympics",
                                 "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_welterweight", "bronze", "/wiki/Oshae_Jones", "/wiki/United_States_at_the_2020_Summer_Olympics",
                                 "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_middleweight", "bronze", "/wiki/Zemfira_Magomedalieva", "/wiki/ROC_at_the_2020_Summer_Olympics",
                                 
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_60_kg", "bronze", "/wiki/Luka_Mkheidze", "/wiki/France_at_the_2020_Summer_Olympics", 
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_66_kg", "bronze", "/wiki/Daniel_Cargnin_(judoka)", "/wiki/Brazil_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_73_kg", "bronze", "/wiki/Tsend-Ochiryn_Tsogtbaatar", "/wiki/Mongolia_at_the_2020_Summer_Olympics", 
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_81_kg", "bronze", "/wiki/Matthias_Casse", "/wiki/Belgium_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_90_kg", "bronze", "/wiki/Kriszti%C3%A1n_T%C3%B3th", "/wiki/Hungary_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_100_kg", "bronze", "/wiki/Niyaz_Ilyasov", "/wiki/ROC_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_%2B100_kg", "bronze", "/wiki/Tamerlan_Bashaev", "/wiki/ROC_at_the_2020_Summer_Olympics",
                                 
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_48_kg", "bronze", "/wiki/Urantsetseg_Munkhbat", "/wiki/Mongolia_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_52_kg", "bronze", "/wiki/Chelsie_Giles", "/wiki/Great_Britain_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_57_kg", "bronze", "/wiki/Tsukasa_Yoshida", "/wiki/Japan_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_63_kg", "bronze", "/wiki/Catherine_Beauchemin-Pinard", "/wiki/Canada_at_the_2020_Summer_Olympics", 
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_70_kg", "bronze", "/wiki/Sanne_van_Dijke", "/wiki/Netherlands_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_78_kg", "bronze", "/wiki/Mayra_Aguiar", "/wiki/Brazil_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_%2B78_kg", "bronze", "/wiki/Romane_Dicko", "/wiki/France_at_the_2020_Summer_Olympics",
                                 
                                 
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Tohar_Butbul", "/wiki/Israel_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Raz_Hershko", "/wiki/Israel_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Li_Kochman", "/wiki/Israel_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Inbar_Lanir", "/wiki/Israel_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Sagi_Muki", "/wiki/Israel_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Timna_Nelson-Levy", "/wiki/Israel_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Peter_Paltchik", "/wiki/Israel_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Shira_Rishony", "/wiki/Israel_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Or_Sasson", "/wiki/Israel_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Gili_Sharir", "/wiki/Israel_at_the_2020_Summer_Olympics",
                                 "/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Baruch_Shmailov", "/wiki/Israel_at_the_2020_Summer_Olympics",
                                 
                                 
                                 
                                 "/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_kata", "bronze", "/wiki/Ali_Sofuo%C4%9Flu", "/wiki/Turkey_at_the_2020_Summer_Olympics",
                                 "/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_67_kg", "bronze", "/wiki/Abdelrahman_Al-Masatfa", "/wiki/Jordan_at_the_2020_Summer_Olympics",
                                 "/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_75_kg", "bronze", "/wiki/Stanislav_Horuna", "/wiki/Ukraine_at_the_2020_Summer_Olympics",
                                 "/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_%2B75_kg", "bronze", "/wiki/U%C4%9Fur_Akta%C5%9F_(karateka)", "/wiki/Turkey_at_the_2020_Summer_Olympics", 
                                 
                                 "/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_kata", "bronze", "/wiki/Viviana_Bottaro", "/wiki/Italy_at_the_2020_Summer_Olympics",
                                 "/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_55_kg", "bronze", "/wiki/Wen_Tzu-yun", "/wiki/Chinese_Taipei_at_the_2020_Summer_Olympics",
                                 "/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_61_kg", "bronze", "/wiki/Merve_%C3%87oban", "/wiki/Turkey_at_the_2020_Summer_Olympics", 
                                 "/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_%2B61_kg", "bronze", "/wiki/Sofya_Berultseva", "/wiki/Ukraine_at_the_2020_Summer_Olympics", 
                                 
                                 
                                 
                                 "/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_58_kg", "bronze", "/wiki/Mikhail_Artamonov_(taekwondo)", "/wiki/ROC_at_the_2020_Summer_Olympics",
                                 "/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_68_kg", "bronze", "/wiki/Zhao_Shuai", "/wiki/China_at_the_2020_Summer_Olympics",
                                 "/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_80_kg", "bronze", "/wiki/Seif_Eissa", "/wiki/Egypt_at_the_2020_Summer_Olympics",
                                 "/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_%2B80_kg", "bronze", "/wiki/Rafael_Alba", "/wiki/Cuba_at_the_2020_Summer_Olympics",
                                 
                                 "/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_49_kg", "bronze", "/wiki/Avishag_Semberg", "/wiki/Israel_at_the_2020_Summer_Olympics",
                                 "/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_57_kg", "bronze", "/wiki/Hatice_K%C3%BCbra_%C4%B0lg%C3%BCn", "/wiki/Turkey_at_the_2020_Summer_Olympics",
                                 "/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_67_kg", "bronze", "/wiki/Hedaya_Wahba", "/wiki/Egypt_at_the_2020_Summer_Olympics",
                                 "/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_%2B67_kg", "bronze", "/wiki/Bianca_Walkden", "/wiki/Great_Britain_at_the_2020_Summer_Olympics",
                                 
                                 
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_freestyle_57_kg", "bronze", "/wiki/Thomas_Gilman", "/wiki/United_States_at_the_2020_Summer_Olympics",
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_freestyle_65_kg", "bronze", "/wiki/Bajrang_Punia", "/wiki/India_at_the_2020_Summer_Olympics",
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_freestyle_74_kg", "bronze", "/wiki/Bekzod_Abdurakhmonov", "/wiki/Uzbekistan_at_the_2020_Summer_Olympics",
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_freestyle_86_kg", "bronze", "/wiki/Myles_Amine", "/wiki/San_Marino_at_the_2020_Summer_Olympics",
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_freestyle_97_kg", "bronze", "/wiki/Abraham_Conyedo", "/wiki/Italy_at_the_2020_Summer_Olympics",
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_freestyle_125_kg", "bronze", "/wiki/Taha_Akg%C3%BCl", "/wiki/Turkey_at_the_2020_Summer_Olympics",
                                 
                                 
                                 
                                 
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_Greco-Roman_60_kg", "bronze", "/wiki/Sergey_Emelin", "/wiki/ROC_at_the_2020_Summer_Olympics",
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_Greco-Roman_67_kg", "bronze", "/wiki/Mohamed_Ibrahim_El-Sayed", "/wiki/Egypt_at_the_2020_Summer_Olympics", 
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_Greco-Roman_77_kg", "bronze", "/wiki/Rafig_Huseynov", "/wiki/Azerbaijan_at_the_2020_Summer_Olympics",
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_Greco-Roman_87_kg", "bronze", "/wiki/Zurab_Datunashvili", "/wiki/Serbia_at_the_2020_Summer_Olympics",
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_Greco-Roman_97_kg", "bronze", "/wiki/Mohammad_Hadi_Saravi", "/wiki/Iran_at_the_2020_Summer_Olympics",
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_Greco-Roman_130_kg", "bronze", "/wiki/Sergey_Semenov_(wrestler)", "/wiki/ROC_at_the_2020_Summer_Olympics",
                                 
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_freestyle_50_kg", "bronze", "/wiki/Sarah_Hildebrandt", "/wiki/United_States_at_the_2020_Summer_Olympics",
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_freestyle_53_kg", "bronze", "/wiki/Bat-Ochiryn_Bolortuyaa", "/wiki/Mongolia_at_the_2020_Summer_Olympics",
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_freestyle_57_kg", "bronze", "/wiki/Evelina_Nikolova", "/wiki/Bulgaria_at_the_2020_Summer_Olympics",
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_freestyle_62_kg", "bronze", "/wiki/Taybe_Yusein", "/wiki/Bulgaria_at_the_2020_Summer_Olympics",
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_freestyle_68_kg", "bronze", "/wiki/Meerim_Zhumanazarova", "/wiki/Kyrgyzstan_at_the_2020_Summer_Olympics",
                                 "/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_freestyle_76_kg", "bronze", "/wiki/Zhou_Qian", "/wiki/China_at_the_2020_Summer_Olympics",
                                 
                                 # fix Barshim moved to separate row
                                 
                                 "/wiki/Athletics_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_high_jump", "gold", "/wiki/Mutaz_Essa_Barshim", "/wiki/Qatar_at_the_2020_Summer_Olympics",
                                 
                                 # fix missing competition for whatever reason
                                 
                                 "/wiki/Canoeing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_C-1_1000_metres", "gold", "/wiki/Isaquias_Queiroz", "/wiki/Brazil_at_the_2016_Summer_Olympics",
                                 "/wiki/Canoeing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_C-1_1000_metres", "silver", "/wiki/Liu_Hao_(canoeist)", "/wiki/China_at_the_2016_Summer_Olympics",
                                 "/wiki/Canoeing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_C-1_1000_metres", "bronze", "/wiki/Serghei_Tarnovschi", "/wiki/Moldova_at_the_2016_Summer_Olympics",
                                 
                                 
                                 
)

all_medalists_df <- bind_rows(all_medalists_df_pre,
                              double_medals) %>% 
  dplyr::filter(stringr::str_detect(string = medalist_link,pattern = "#", negate = TRUE)) %>% 
  distinct(event_link, medalist_link, .keep_all = TRUE)


####  check: 
# dplyr::group_by(event_link, medalist_link) %>% 
# count() %>% 
# dplyr::filter(n>1)

# nrow(all_medalists_df_pre)+nrow(double_medals)





all_medalists_df %>% 
  dplyr::slice_sample(n = 10) %>% 
  knitr::kable()
event_link medal medalist_link delegation_link
/wiki/Rugby_sevens_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_tournament silver /wiki/Caroline_Drouin /wiki/France_at_the_2020_Summer_Olympics
/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_freestyle_57_kg bronze /wiki/Helen_Maroulis /wiki/United_States_at_the_2020_Summer_Olympics
/wiki/Swimming_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_4_%C3%97_100_metre_medley_relay gold /wiki/Adam_Peaty /wiki/Great_Britain_at_the_2020_Summer_Olympics
/wiki/Swimming_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_4_%C3%97_100_metre_medley_relay gold /wiki/Emily_Seebohm /wiki/Australia_at_the_2020_Summer_Olympics
/wiki/Table_tennis_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_team bronze /wiki/Minnie_Soo_Wai_Yam /wiki/Hong_Kong_at_the_2020_Summer_Olympics
/wiki/Weightlifting_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_%2B109_kg silver /wiki/Ali_Davoudi /wiki/Iran_at_the_2020_Summer_Olympics
/wiki/Football_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_tournament silver /wiki/Amanda_Ilestedt /wiki/Sweden_at_the_2020_Summer_Olympics
/wiki/Cycling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_road_time_trial gold /wiki/Primo%C5%BE_Rogli%C4%8D /wiki/Slovenia_at_the_2020_Summer_Olympics
/wiki/Swimming_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_400_metre_freestyle silver /wiki/Jack_McLoughlin /wiki/Australia_at_the_2020_Summer_Olympics
/wiki/Softball_at_the_2020_Summer_Olympics gold /wiki/Nayu_Kiyohara /wiki/Japan_at_the_2020_Summer_Olympics

So we have links to Wikipedia pages, which is a good start. But Wikipedia does not really have much structured data, so we need to find the Wikidata identifier for each of those Wikipedia pages. We could download the full html of each of those pages and extract the link from there, but we’ll play it nicely and use the Wikidata API, and parse the json we obtain from it.

Get Wikidata identifiers

So we’ll cache locally the json file we get from the Wikipedia page, and and associated to each of those links a Wikidata identifier.

For reference, here’s how a random Wikipedia API page looks like:

fs::dir_create("wikipedia_medalists_json")


wiki_api_links <- stringr::str_c("https://en.wikipedia.org/w/api.php?action=query&redirects=true&prop=pageprops&titles=",
                                 stringr::str_remove(all_medalists_df$medalist_link, "/wiki/"),
                                 "&format=json")

local_json_files <- fs::path("wikipedia_medalists_json", 
                             all_medalists_df$medalist_link %>%
                               fs::path_file() %>%
                               paste0(., ".json"))


sample(wiki_api_links, size = 1)
[1] "https://en.wikipedia.org/w/api.php?action=query&redirects=true&prop=pageprops&titles=Li_Bingjie&format=json"
o2020_get_wikidata_from_link <- function(link) {
  
  wiki_api_links_country <- stringr::str_c("https://en.wikipedia.org/w/api.php?action=query&redirects=true&prop=pageprops&titles=",
                                           stringr::str_remove(link, "/wiki/"),
                                           "&format=json")
  
  local_json_files_country <- fs::path("wikipedia_medalists_json", 
                                       link %>%
                                         fs::path_file() %>%
                                         paste0(., ".json"))
  
  wikidata_id <- purrr::map2_chr(
    .x = wiki_api_links_country,
    .y = local_json_files_country,
    .f = function(x, y) {
      
      if (fs::file_exists(y)==FALSE) {
        download.file(url = x, destfile = y)
        Sys.sleep(1)
      }
      
      jsonlite::read_json(path = y)
      output <- jsonlite::read_json(path = y) %>% 
        purrr::pluck("query",
                     "pages",
                     1,
                     "pageprops",
                     "wikibase_item")
      
      if (length(output)==0) {
        as.character(NA)
      } else {
        output
      }
    })
  
  #tibble::tibble(link = link, wikidata_id, wikidata_id)
  wikidata_id
  
}

## check
# nrow(all_medalists_df)==length(wikidata_id)
# wiki_api_links[is.na(wikidata_id_medalists)]

I will keep the cached json files in the repo, so you don’t have to download them again; the Wikidata identifier should not change anyway.

all_medalists_wd_df <- all_medalists_df %>% 
  dplyr::mutate(medalist_wikidata_id = o2020_get_wikidata_from_link(medalist_link),
                event_wikidata_id = o2020_get_wikidata_from_link(event_link),
                delegation_wikidata_id = o2020_get_wikidata_from_link(delegation_link)) %>% 
  dplyr::mutate(event_link = paste0("https://en.wikipedia.org", event_link),
                medalist_link = paste0("https://en.wikipedia.org", medalist_link),
                delegation_link = paste0("https://en.wikipedia.org", delegation_link))
# due to frequent failures, consider something like this at first run
while (length(fs::dir_ls("wikipedia_medalists_json")<2600)) {
  
  try(all_medalists_wd_df <- all_medalists_df %>% 
        dplyr::mutate(medalist_wikidata_id = o2020_get_wikidata_from_link(medalist_link),
                      event_wikidata_id = o2020_get_wikidata_from_link(event_link),
                      delegation_wikidata_id = o2020_get_wikidata_from_link(delegation_link)) %>% 
        dplyr::mutate(event_link = paste0("https://en.wikipedia.org", event_link),
                      medalist_link = paste0("https://en.wikipedia.org", medalist_link),
                      delegation_link = paste0("https://en.wikipedia.org", delegation_link)))
}
all_medalists_wd_df %>% 
  dplyr::slice_sample(n = 10) %>% 
  knitr::kable()
event_link medal medalist_link delegation_link medalist_wikidata_id event_wikidata_id delegation_wikidata_id
https://en.wikipedia.org/wiki/Athletics_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_100_metres_hurdles bronze https://en.wikipedia.org/wiki/Megan_Tapper https://en.wikipedia.org/wiki/Jamaica_at_the_2020_Summer_Olympics Q26406052 Q66091470 Q42914563
https://en.wikipedia.org/wiki/Athletics_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_4_%C3%97_100_metres_relay silver https://en.wikipedia.org/wiki/Nethaneel_Mitchell-Blake https://en.wikipedia.org/wiki/Great_Britain_at_the_2020_Summer_Olympics Q14914875 Q66797109 Q50014472
https://en.wikipedia.org/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_flyweight gold https://en.wikipedia.org/wiki/Stoyka_Krasteva https://en.wikipedia.org/wiki/Bulgaria_at_the_2020_Summer_Olympics Q2107331 Q65241562 Q42913733
https://en.wikipedia.org/wiki/Football_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_tournament silver https://en.wikipedia.org/wiki/Jon_Moncayola https://en.wikipedia.org/wiki/Spain_at_the_2020_Summer_Olympics Q66528894 Q61056311 Q42914281
https://en.wikipedia.org/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team bronze https://en.wikipedia.org/wiki/Theresa_Stoll https://en.wikipedia.org/wiki/Germany_at_the_2020_Summer_Olympics Q29512026 Q65235579 Q42914232
https://en.wikipedia.org/wiki/Cycling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_team_pursuit bronze https://en.wikipedia.org/wiki/Lily_Williams_(cyclist) https://en.wikipedia.org/wiki/United_States_at_the_2020_Summer_Olympics Q29168606 Q67151040 Q42914289
https://en.wikipedia.org/wiki/Handball_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_tournament bronze https://en.wikipedia.org/wiki/Vilde_Johansen https://en.wikipedia.org/wiki/Norway_at_the_2020_Summer_Olympics Q19974501 Q32639825 Q50014452
https://en.wikipedia.org/wiki/Baseball_at_the_2020_Summer_Olympics gold https://en.wikipedia.org/wiki/K%C5%8Dy%C5%8D_Aoyagi https://en.wikipedia.org/wiki/Japan_at_the_2020_Summer_Olympics Q22117711 Q39080747 Q42914581
https://en.wikipedia.org/wiki/Swimming_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_4_%C3%97_100_metre_medley_relay bronze https://en.wikipedia.org/wiki/Taylor_Ruck https://en.wikipedia.org/wiki/Canada_at_the_2020_Summer_Olympics Q20973742 Q64514187 Q42913821
https://en.wikipedia.org/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_78_kg bronze https://en.wikipedia.org/wiki/Anna-Maria_Wagner https://en.wikipedia.org/wiki/Germany_at_the_2020_Summer_Olympics Q52768062 Q65235568 Q42914232

Are these all Olympics medalists? We now have 2175 and 339 events, which correspond to the figure given in the opening line of the Wikipedia page used as the source for this endeavour.

Apparently, yes.

So now we have successfully associated Wikidata identifiers with the relevant Wikipedia pages. Let the Wikidata-based fun begin.

Getting the information we need from Wikidata

Now that we have a Wikidata id, we’re on a downward slope to get a lot more data without much pain. Look at the beauty of the massive piped call that follows, with each line getting more and more data.

all_medalists_with_wd_details_df <- all_medalists_wd_df %>% 
  mutate(medalist_name = tw_get_label(medalist_wikidata_id),
         place_of_birth_wikidata_id = tw_get_property_same_length(id = medalist_wikidata_id,
                                                                  p = "P19",
                                                                  only_first = TRUE,
                                                                  preferred = TRUE), 
         date_of_birth = tw_get_property_same_length(id = medalist_wikidata_id,
                                                     p = "P569",
                                                     only_first = TRUE,
                                                     preferred = TRUE), 
         event_sport_wikidata_id = tw_get_property_same_length(id = event_wikidata_id,
                                                               p = "P361",
                                                               only_first = TRUE,
                                                               preferred = TRUE),
         event_part_of_wikidata_id = tw_get_property_same_length(id = event_wikidata_id,
                                                                 p = "P641",
                                                                 only_first = TRUE,
                                                                 preferred = TRUE),
         event_name = tw_get_label(event_wikidata_id),
         delegation_name = tw_get_label(delegation_wikidata_id), 
         sex_or_gender_wikidata_id = tw_get_property_same_length(id = medalist_wikidata_id,
                                                                 p = "P21",
                                                                 only_first = TRUE,
                                                                 preferred = TRUE)) %>% 
  
  
  mutate(country_medal_wikidata_id = tw_get_property_same_length(id = delegation_wikidata_id,
                                                                 p = "P17",
                                                                 only_first = TRUE, 
                                                                 preferred = TRUE),
         event_sport = tw_get_label(event_sport_wikidata_id),
         event_part_of = tw_get_label(event_part_of_wikidata_id),
         place_of_birth = tw_get_label(place_of_birth_wikidata_id),
         place_of_birth_located_in_wikidata_id = tw_get_property_same_length(id = place_of_birth_wikidata_id,
                                                                             p = "P131",
                                                                             only_first = TRUE,
                                                                             preferred = TRUE),
         place_of_birth_coordinates = tw_get_property_same_length(id = place_of_birth_wikidata_id,
                                                                  p = "P625",
                                                                  only_first = TRUE,
                                                                  preferred = TRUE),
         date_of_birth = stringr::str_extract(string = date_of_birth,
                                              pattern = "[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}") %>% 
           as.Date(),
         sex_or_gender = tw_get_label(sex_or_gender_wikidata_id)
  ) %>% 
  
  
  tidyr::separate(col = place_of_birth_coordinates,
                  into = c("lat", "lon"),
                  sep = ",",
                  remove = FALSE) %>% 
  
  
  mutate(country_medal = tw_get_label(country_medal_wikidata_id),
         place_of_birth_located_in = tw_get_label(place_of_birth_located_in_wikidata_id))

Now, life’s not perfect, and so isn’t Wikidata. Some athletes do not have a complete file on them on Wikidata, so we may miss some places of birth, for example. Also, in order to reduce complexity of the output, I have limited retrieval of some properties to the first result. For example, Wikidata knows that a place is located within more than one “administrative territorial entity” (or was at different points in time), but I just kept the first result to keep the data in tabular format and reduce complexity.

To aggregate results consistently at the level of region/province/state, I will rely on geographic datasets and match the coordinates with the place of birth. For the time being, I’ll do this only for Europe (all countries which are part of the NUTS standard). It should be relatively easy to expand to other countries if there’s a geographic dataset to do the matching, but this is a start, and data extracted from Wikidata will have to do for the rest of the world for the time being.

Geo-matching with NUTS

So let’s match the coordinates of the place of birth of each medalist with the geographic dataset of NUTS regions.

all_medalists_wd_nuts_df <- all_medalists_with_wd_details_df %>% 
  dplyr::left_join(
    all_medalists_with_wd_details_df %>% 
      dplyr::filter(is.na(lon)==FALSE) %>% 
      dplyr::distinct(medalist_wikidata_id, .keep_all = TRUE) %>% 
      sf::st_as_sf(coords = c("lon","lat"), crs = 4326) %>% 
      sf::st_join(latlon2map::ll_get_nuts_eu(level = 1, year = 2016),
                  join = sf::st_intersects) %>% 
      st_drop_geometry() %>% 
      dplyr::transmute(medalist_wikidata_id,
                       nuts1_id = NUTS_ID,
                       nuts1_name = NAME_LATN),
    by = "medalist_wikidata_id") %>% 
  dplyr::left_join(
    all_medalists_with_wd_details_df %>% 
      dplyr::filter(is.na(lon)==FALSE) %>% 
      dplyr::distinct(medalist_wikidata_id, .keep_all = TRUE) %>% 
      sf::st_as_sf(coords = c("lon","lat"), crs = 4326) %>% 
      sf::st_join(latlon2map::ll_get_nuts_eu(level = 2, year = 2016),
                  join = sf::st_intersects) %>% 
      st_drop_geometry() %>% 
      dplyr::transmute(medalist_wikidata_id,
                       nuts2_id = NUTS_ID,
                       nuts2_name = NAME_LATN),
    by = "medalist_wikidata_id") %>% 
  dplyr::left_join(
    all_medalists_with_wd_details_df %>% 
      dplyr::filter(is.na(lon)==FALSE) %>% 
      dplyr::distinct(medalist_wikidata_id, .keep_all = TRUE) %>% 
      sf::st_as_sf(coords = c("lon","lat"), crs = 4326) %>% 
      sf::st_join(latlon2map::ll_get_nuts_eu(level = 3, year = 2016),
                  join = sf::st_intersects) %>% 
      st_drop_geometry() %>% 
      dplyr::transmute(medalist_wikidata_id,
                       nuts3_id = NUTS_ID,
                       nuts3_name = NAME_LATN),
    by = "medalist_wikidata_id") 

Add population data

https://ec.europa.eu/eurostat/web/products-datasets/product?code=tgs00096

library("eurostat")

fs::dir_create("eurostat")

population_by_nuts2_2020_file <- fs::path("eurostat",
                                          "population_by_nuts2_2020.csv")

if (fs::file_exists(population_by_nuts2_2020_file)==FALSE) {
  eurostat::get_eurostat(id = "tgs00096", filters = list(age = "TOTAL",
                                                         sex = "T")) %>% 
    dplyr::arrange(geo, dplyr::desc(time)) %>% 
    dplyr::filter(is.na(values)==FALSE) %>% 
    dplyr::group_by(geo) %>% 
    dplyr::slice(1) %>% 
    dplyr::ungroup() %>% 
    tidyr::drop_na() %>% 
    dplyr::transmute(nuts2_id = geo, nuts2_population = values) %>% 
    dplyr::filter(nuts2_population!=0) %>% 
    readr::write_csv(population_by_nuts2_2020_file)
}

population_by_nuts2 <- readr::read_csv(file = population_by_nuts2_2020_file, show_col_types = FALSE)



population_by_nuts3_2020_file <- fs::path("eurostat",
                                          "population_by_nuts3_2020.csv")

if (fs::file_exists(population_by_nuts3_2020_file)==FALSE) {
  eurostat::get_eurostat(id = "demo_r_pjanaggr3",
                         filters = list(age = "TOTAL",
                                        sex = "T")) %>% 
    dplyr::arrange(geo, dplyr::desc(time)) %>% 
    dplyr::filter(is.na(values)==FALSE) %>% 
    dplyr::group_by(geo) %>% 
    dplyr::slice(1) %>% 
    dplyr::ungroup() %>% 
    tidyr::drop_na() %>% 
    dplyr::transmute(nuts3_id = geo, nuts3_population = values) %>% 
    dplyr::filter(nuts3_population!=0) %>% 
    dplyr::filter(nchar(nuts3_id)==5) %>% 
    readr::write_csv(population_by_nuts3_2020_file)
}

population_by_nuts3 <- readr::read_csv(file = population_by_nuts3_2020_file, show_col_types = FALSE)


if (fs::file_exists(fs::path("eurostat",
                             "gdp_by_nuts3_2019.csv"))==FALSE) {
  
  gdp_by_nuts_df <- eurostat::get_eurostat(id = "nama_10r_3gdp",
                                           filters = list(unit = "MIO_EUR"))  %>%
    dplyr::arrange(geo, dplyr::desc(time)) %>% 
    dplyr::filter(is.na(values)==FALSE) %>% 
    dplyr::group_by(geo) %>% 
    dplyr::slice(1) %>% 
    dplyr::ungroup() %>% 
    tidyr::drop_na() %>% 
    dplyr::filter(values !=0) 
  
  
  gdp_by_nuts_df %>% 
    dplyr::transmute(nuts2_id = geo, nuts2_gdp = values) %>% 
    dplyr::filter(nchar(nuts2_id)==4) %>% 
    readr::write_csv(file = fs::path("eurostat",
                                     "gdp_by_nuts2_2019.csv"))
  
  gdp_by_nuts_df %>% 
    dplyr::transmute(nuts3_id = geo, nuts3_gdp = values) %>% 
    dplyr::filter(nchar(nuts3_id)==5) %>%
    readr::write_csv(file = fs::path("eurostat",
                                     "gdp_by_nuts3_2019.csv"))
  
  
}

gdp_by_nuts2_2019_df <- readr::read_csv(fs::path("eurostat",
                                                 "gdp_by_nuts2_2019.csv"),
                                        show_col_types = FALSE)

gdp_by_nuts3_2019_df <- readr::read_csv(fs::path("eurostat",
                                                 "gdp_by_nuts3_2019.csv"),
                                        show_col_types = FALSE)
all_medalists_wd_nuts_pop_df <- all_medalists_wd_nuts_df %>% 
  dplyr::left_join(y = population_by_nuts2, by = "nuts2_id") %>% 
  dplyr::left_join(y = population_by_nuts3, by = "nuts3_id") %>% 
  dplyr::left_join(y = gdp_by_nuts2_2019_df, by = "nuts2_id") %>% 
  dplyr::left_join(y = gdp_by_nuts3_2019_df, by = "nuts3_id") %>% 
  dplyr::mutate(nuts0_id = stringr::str_extract(string = nuts2_id, pattern = "[A-Z]{2}")) %>% 
  dplyr::mutate(nuts0_name = dplyr::case_when(is.na(nuts0_id) ~ as.character(NA),
                                              nuts0_id == "EL" ~ "Greece", 
                                              nuts0_id == "UK" ~ "United Kingdom", 
                                              TRUE ~ countrycode::countrycode(sourcevar = nuts0_id,
                                                                              origin = "iso2c",
                                                                              destination = "country.name"))) %>% 
  
  dplyr::select(event_link, 
                event_wikidata_id,
                event_name,
                event_sport_wikidata_id,
                event_sport,
                event_part_of_wikidata_id,
                event_part_of,
                medal, 
                medalist_link, 
                medalist_wikidata_id,
                medalist_name,
                delegation_link,
                delegation_wikidata_id,
                delegation_name,
                country_medal_wikidata_id,
                country_medal,
                date_of_birth,
                sex_or_gender_wikidata_id,
                sex_or_gender,
                place_of_birth_wikidata_id,
                place_of_birth,
                place_of_birth_coordinates,
                lat,
                lon,
                place_of_birth_located_in_wikidata_id,
                place_of_birth_located_in, 
                nuts0_id,
                nuts0_name,
                nuts1_id, 
                nuts1_name,
                nuts2_id,
                nuts2_name,
                nuts2_population,
                nuts2_gdp,
                nuts3_id,
                nuts3_name,
                nuts3_population,
                nuts3_gdp)

How complete are the results?

Let’s check how many missing data we have for each column of data:

tibble::tibble(columns = colnames(all_medalists_wd_nuts_pop_df),
               total = nrow(all_medalists_wd_nuts_pop_df),
               total_missing = purrr::map_dbl(.x = all_medalists_wd_nuts_pop_df,
                                              .f = function(x) sum(is.na(x)))) %>% 
  dplyr::mutate(share_missing = total_missing/total) %>% 
  mutate(share_missing = scales::percent(x = share_missing)) %>% 
  knitr::kable()
columns total total_missing share_missing
event_link 2401 0 0.000%
event_wikidata_id 2401 0 0.000%
event_name 2401 0 0.000%
event_sport_wikidata_id 2401 0 0.000%
event_sport 2401 0 0.000%
event_part_of_wikidata_id 2401 0 0.000%
event_part_of 2401 0 0.000%
medal 2401 0 0.000%
medalist_link 2401 0 0.000%
medalist_wikidata_id 2401 0 0.000%
medalist_name 2401 0 0.000%
delegation_link 2401 0 0.000%
delegation_wikidata_id 2401 0 0.000%
delegation_name 2401 0 0.000%
country_medal_wikidata_id 2401 146 6.081%
country_medal 2401 146 6.081%
date_of_birth 2401 8 0.333%
sex_or_gender_wikidata_id 2401 1 0.042%
sex_or_gender 2401 1 0.042%
place_of_birth_wikidata_id 2401 63 2.624%
place_of_birth 2401 69 2.874%
place_of_birth_coordinates 2401 74 3.082%
lat 2401 74 3.082%
lon 2401 74 3.082%
place_of_birth_located_in_wikidata_id 2401 110 4.581%
place_of_birth_located_in 2401 128 5.331%
nuts0_id 2401 1573 65.514%
nuts0_name 2401 1573 65.514%
nuts1_id 2401 1573 65.514%
nuts1_name 2401 1573 65.514%
nuts2_id 2401 1573 65.514%
nuts2_name 2401 1573 65.514%
nuts2_population 2401 1573 65.514%
nuts2_gdp 2401 1709 71.179%
nuts3_id 2401 1573 65.514%
nuts3_name 2401 1573 65.514%
nuts3_population 2401 1573 65.514%
nuts3_gdp 2401 1710 71.220%

The share of missing data for NUTS seems high, but this is expected as non-European medalists are mostly not born within a European NUTS region. Only of handful of European medalists do nothave information on their place of birth.

Get the data

all_medalists_wd_nuts_pop_df %>% 
  readr::write_csv(file = "medalists_all.csv")

all_medalists_wd_nuts_pop_df %>%
  dplyr::filter(is.na(nuts0_id)==FALSE) %>% 
  readr::write_csv(file = "medalists_nuts_only.csv")

all_medalists_wd_nuts_pop_df %>%
  dplyr::filter(is.na(place_of_birth_wikidata_id)==TRUE) %>% 
  dplyr::select(event_name, medal, medalist_name, medalist_wikidata_id, country_medal, delegation_name, sex_or_gender) %>% 
  readr::write_csv(file = "medalists_missing_place_of_birth.csv")


all_medalists_wd_nuts_pop_df %>% 
  dplyr::filter(is.na(nuts2_name)==FALSE) %>% 
  dplyr::group_by(nuts2_name) %>% 
  dplyr::count(name = "medals", sort = TRUE) %>% 
  dplyr::ungroup() %>% 
  dplyr::left_join(y = all_medalists_wd_nuts_pop_df %>% 
                     dplyr::distinct(nuts2_id, nuts2_name, nuts2_population, nuts0_name), by = "nuts2_name") %>% 
  dplyr::mutate(medals_per_million_residents = 1e06/(nuts2_population/medals)) %>% 
  dplyr::arrange(dplyr::desc(medals_per_million_residents)) %>% 
  dplyr::transmute(nuts2_id, 
                   nuts2_name,
                   country = nuts0_name,
                   medals, 
                   nuts2_population,
                   medals_per_million_residents) %>% 
  readr::write_csv(file = "medals_per_million_residents_in_nuts2.csv")

You can download the dataset from the repository including this script, or directly clicking on this link.

Here is the dataset for only medalists with place of birth in a NUTS region.

Here is the dataset for only medalists with no known place of birth in Wikidata (a good starting place to contribute to Wikidata).

See this blog post introducing the dataset, with some preliminary results about which NUTS 2 got more medals.

Update global map

library("dplyr", warn.conflicts = FALSE)
library("leaflet")

all_medalists_df <- readr::read_csv("medalists_all.csv", show_col_types = FALSE) %>% 
  dplyr::filter(is.na(lon)==FALSE) 
# %>% 
#   dplyr::mutate(img_filename = purrr::map_chr(.x = medalist_wikidata_id,
#                                               function(x) {tw_get_image(id = x)[1]})) 






all_medalists_ll <- all_medalists_df  %>% 
  dplyr::select(lat, lon, medal, medalist_wikidata_id, medalist_name, medalist_link, event_part_of, delegation_name, place_of_birth
                #, img_filename
  ) %>% 
  dplyr::mutate(medal_icon_link = dplyr::case_when(medal == "gold" ~ "https://upload.wikimedia.org/wikipedia/commons/4/4f/Gold_medal_olympic.svg",
                                                   medal == "silver" ~ "https://upload.wikimedia.org/wikipedia/commons/6/67/Silver_medal_olympic.svg",
                                                   medal == "bronze" ~ "https://upload.wikimedia.org/wikipedia/commons/f/f9/Bronze_medal_olympic.svg")) %>% 
  dplyr::group_by(medalist_wikidata_id) %>% 
  dplyr::mutate(medal = stringr::str_c(medal,collapse = ", "), event_part_of = stringr::str_c(unique(event_part_of), collapse = ", ")) %>% 
  dplyr::mutate(popup_content = stringr::str_c(
    "<big><b><a href='", medalist_link, "' target='_blank'>", medalist_name, "</a></b><br />",
    "Born in ", place_of_birth,"<br />",
    "Won ", medal, " medal in ", event_part_of, " for ", stringr::str_remove(string = delegation_name, pattern = " at the 2020 Summer Olympics"), "</big>"
    #,
    #    "<a href='https://en.wikipedia.org", medalist_link, "'>", "<img src='", "https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/", img_filename,  "&width=120'></a>"
    
  )) %>% 
  dplyr::ungroup() %>% 
  dplyr::group_by(lat, lon) %>% 
  dplyr::add_count(name = "n_born_same_place") %>% 
  dplyr::mutate(lat = dplyr::if_else(condition = n_born_same_place>1,
                                     true = jitter(x = lat, amount = 0.01),
                                     false = lat,
                                     missing = as.numeric(NA))) %>% 
  dplyr::mutate(lon = dplyr::if_else(condition = n_born_same_place>1,
                                     true = jitter(x = lon, amount = 0.01),
                                     false = lon,
                                     missing = as.numeric(NA)))

# remove the following line if you're running it on your own:
# <script defer data-domain=\"edjnet.github.io\" src=\"https://plausible.europeandatajournalism.eu/js/plausible.js\"></script>
# the first part is necessary for responsive popups on mobile

responsiveness_and_stats <- "\'<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\"><script defer data-domain=\"edjnet.github.io\" src=\"https://plausible.europeandatajournalism.eu/js/plausible.js\"></script>\'"

leaflet_medals <- all_medalists_ll %>% 
  leaflet() %>%
  leaflet::addTiles(urlTemplate = "https://{s}.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}{r}.png") %>% 
  leaflet::addMarkers(
    lng = all_medalists_ll$lon,
    lat = all_medalists_ll$lat,
    popup = all_medalists_ll$popup_content,
    icon = ~ icons(
      iconUrl = medal_icon_link,
      iconWidth = 24, iconHeight = 24,
    )
  ) %>% 
  htmlwidgets::onRender(paste0("
    function(el, x) {
      $('head').append(",responsiveness_and_stats,");
    }"))




htmlwidgets::saveWidget(leaflet_medals, file="medalists_map.html",
                        title = "Medalists at the 2020 Summer Olympics by place of birth",
                        selfcontained = TRUE)

leaflet_medals

You can see this interactive map in full screen following this link.

place_of_birth_table <- readr::read_csv("medalists_all.csv", show_col_types = FALSE) %>% 
  dplyr::distinct(medalist_wikidata_id, lon) %>% 
  mutate(place_of_birth_coordinates = is.na(lon)==FALSE) %>% 
  summarise(place_of_birth = sum(is.na(lon)==FALSE), missing_place_of_birth = sum(is.na(lon)==TRUE)) %>% 
  mutate(share_present = scales::percent(1-(missing_place_of_birth/place_of_birth)))

At last update, place of birth is present for 97% of medalists, so only 72 medalists out of 2175 are missing from this map.

place_of_birth_table %>% 
  knitr::kable()
place_of_birth missing_place_of_birth share_present
2103 72 97%
# re-check from Wikidata missing data to see if they have been added
readr::read_csv("medalists_missing_place_of_birth.csv", show_col_types = FALSE) %>% 
  dplyr::pull(medalist_wikidata_id) %>% 
  tw_get(overwrite_cache = TRUE)