This scripts makes use of some of the most common packages for R for data wrangling, scraping, and geocomputation.
It also makes use of two additional packages created within the scope of EDJNet:
tidywikidatar
, for getting data out Wikidata. It’s on CRAN, and you can can install it with install.packages("tidywikidatar")
. Make sure to have at least version 0.4.1 (it should be on CRAN soon), or download the latest version from GitHub. More info about the package on its own website.latlon2map
, for caching locally geographic datasets distributed by the European Union, for some convenience functions for geo-matching, and for interactive exploration of the dataset. You can install it from GitHub - remotes::install_github("giocomai/latlon2map")
, and find more information about it on its own website.By default, all data will be cached inside the active folder (likely, your project folder if you have cloned this with the likest of Rstudio). If you intend to use tidywikidatar
or latlon2map
in other projects, you may want to set the cache in a shared folder, as suggested in the code commented out below.
knitr::opts_chunk$set(echo = TRUE)
library("dplyr")
library("rvest")
library("stringr")
library("sf")
library("jsonlite")
library("purrr")
# remotes::install_github("EDJNet/tidywikidatar")
library("tidywikidatar")
tw_enable_cache()
# tw_set_cache_folder(path = fs::path(fs::path_home_r(), "R", "tw_data"))
tw_set_language(language = "en")
tw_create_cache_folder(ask = FALSE)
# remotes::install_github("giocomai/latlon2map")
library("latlon2map")
# ll_set_folder(path = fs::path(fs::path_home_r(), "R", "ll_data"))
options(timeout = 60000) # give the time to download the needed datasets
We’ll be getting all medals from this Wikipedia page. If you cloned this repository, you’ll find a version of this page cached locally. If you want to update it, just delete the local all_medals_wiki.html
and re-download the latest version.
In this table, each row is a competition, each cell corresponds to one or more medal winners (for team sports).
if (fs::file_exists("all_medals_wiki.html")==FALSE) {
download.file(url = "https://en.wikipedia.org/wiki/List_of_2020_Summer_Olympics_medal_winners",
destfile = "all_medals_wiki.html")
}
all_medals_page <- read_html(x = "all_medals_wiki.html")
all_medal_tables <- all_medals_page %>%
html_nodes(css = "table")
Here I create some functions that I will use to extract data from the table. It’s a bit of a pain to extract data consistently, including relevant links, and there are likely easier way to do this, but this seems to work alright.
I extract separately athletes, and the country they won their medal for, and then pull the data together.
o20_get_medalist_from_cell <- function(row,
td_number) {
links <- row %>%
html_nodes("td") %>%
.[[td_number]] %>%
html_nodes("a") %>%
html_attr("href")
if (length(links)==0) {
list(links)
} else {
links[stringr::str_detect(string = links,
pattern = "_at_the_2020_Summer_Olympics",
negate = TRUE)] %>%
list()
}
}
o20_get_country_from_cell <- function(row, td_number) {
links <- row %>%
html_nodes("td") %>%
.[[td_number]] %>%
html_nodes("a") %>%
html_attr("href")
if (length(links)==0) {
as.character(NA)
} else {
links[stringr::str_detect(string = links,
pattern = "_at_the_2020_Summer_Olympics",
negate = FALSE)]
}
}
o20_get_medals_from_table <- function(table) {
#print(class(table))
table_colnames <- table %>%
html_table() %>%
colnames()
if (!(length(table_colnames)==4|length(table_colnames)==7|length(table_colnames)==6)) {
return(NULL)
}
col4 <- table %>%
html_table() %>%
colnames() %>%
.[4]
if (col4 != "Bronze" & col4 != "Silver") {
return(NULL)
}
current_table <- table %>%
html_nodes("tr") %>%
.[-1] #remove header
purrr::map_dfr(
.x = current_table,
.f = function(current_row) {
# print(current_row %>%
# html_nodes("td") %>%
# .[[1]] %>% html_text())
col_number <- current_row %>%
html_nodes("td") %>%
length()
if (col_number==4) {
medal_row_df <- tibble::tibble(event_link = current_row %>%
html_nodes("td") %>%
.[[1]] %>%
html_nodes("a") %>%
html_attr("href"),
gold_medalist = current_row %>%
o20_get_medalist_from_cell(td_number = 2),
gold_country = current_row %>%
o20_get_country_from_cell(td_number = 2),
silver_medalist = current_row %>%
o20_get_medalist_from_cell(td_number = 3),
silver_country = current_row %>%
o20_get_country_from_cell(td_number = 3),
bronze_medalist = current_row %>%
o20_get_medalist_from_cell(td_number = 4),
bronze_country = current_row %>%
o20_get_country_from_cell(td_number = 4),
)
} else if (col_number==7) {
medal_row_df <- tibble::tibble(event_link = current_row %>%
html_nodes("td") %>%
.[[1]] %>%
html_nodes("a") %>%
html_attr("href"),
gold_medalist = current_row %>%
o20_get_medalist_from_cell(td_number = 2),
gold_country = current_row %>%
o20_get_country_from_cell(td_number = 2),
silver_medalist = current_row %>%
o20_get_medalist_from_cell(td_number = 4),
silver_country = current_row %>%
o20_get_country_from_cell(td_number = 4),
bronze_medalist = current_row %>%
o20_get_medalist_from_cell(td_number = 6),
bronze_country = current_row %>%
o20_get_country_from_cell(td_number = 6)
)
} else if (col_number==6) {
medal_row_df <- tibble::tibble(event_link = current_row %>%
html_nodes("td") %>%
.[[1]] %>%
html_nodes("a") %>%
html_attr("href"),
gold_medalist = current_row %>%
o20_get_medalist_from_cell(td_number = 2),
gold_country = current_row %>%
o20_get_country_from_cell(td_number = 2),
silver_medalist = current_row %>%
o20_get_medalist_from_cell(td_number = 4),
silver_country = current_row %>%
o20_get_country_from_cell(td_number = 4),
bronze_medalist = current_row %>%
o20_get_medalist_from_cell(td_number = 5),
bronze_country = current_row %>%
o20_get_country_from_cell(td_number = 5)
)
} else {
return(NULL)
}
dplyr::bind_rows(medal_row_df %>%
dplyr::transmute(event_link,
medal = "gold",
medalist_link = gold_medalist,
delegation_link = gold_country),
medal_row_df %>%
dplyr::transmute(event_link,
medal = "silver",
medalist_link = silver_medalist,
delegation_link = silver_country),
medal_row_df %>%
dplyr::transmute(event_link,
medal = "bronze",
medalist_link = bronze_medalist,
delegation_link = bronze_country)
) %>%
tidyr::unnest(cols = "medalist_link") %>%
dplyr::mutate(medalist_link = as.character(medalist_link)) %>%
dplyr::filter(stringr::str_detect(string = medalist_link,
pattern = "#endnote",
negate = TRUE))
})
}
Once these functions are defined, then everything looks super-easy. Here’s how the data look like at this stage.
all_medalists_df_pre <- purrr::map_dfr(.x = all_medal_tables,
.f = function(table) {
o20_get_medals_from_table(table)
})
all_medalists_df_pre$medalist_link[all_medalists_df_pre$medalist_link == "/wiki/Gianmarco_Tamberi"&all_medalists_df_pre$delegation_link=="/wiki/Qatar_at_the_2020_Summer_Olympics"] <- "/wiki/Mutaz_Essa_Barshim"
all_medalists_df_pre$medalist_link[all_medalists_df_pre$medalist_link == "/wiki/Mutaz_Essa_Barshim"&all_medalists_df_pre$delegation_link=="/wiki/Italy_at_the_2020_Summer_Olympics"] <- "/wiki/Gianmarco_Tamberi"
## add manually bronze medals shared by two athletes... less than ideal, but not in the mood to fix the parser now
## update: I didn't realise there were so many double-bronze... a tiny fix above would have been much quicker, but anyways...
double_medals <- tibble::tribble(~event_link, ~medal, ~medalist_link, ~delegation_link,
"/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_flyweight", "bronze", "/wiki/Saken_Bibossinov", "/wiki/Kazakhstan_at_the_2020_Summer_Olympics", "/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_featherweight","bronze", "/wiki/L%C3%A1zaro_%C3%81lvarez", "/wiki/Cuba_at_the_2020_Summer_Olympics",
"/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_lightweight", "bronze", "/wiki/Harry_Garside", "/wiki/Australia_at_the_2020_Summer_Olympics",
"/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_welterweight", "bronze", "/wiki/Aidan_Walsh_(boxer)", "/wiki/Ireland_at_the_2020_Summer_Olympics",
"/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_middleweight", "bronze", "/wiki/Gleb_Bakshi", "/wiki/ROC_at_the_2020_Summer_Olympics",
"/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_light_heavyweight", "bronze", "/wiki/Imam_Khataev", "/wiki/ROC_at_the_2020_Summer_Olympics",
"/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_heavyweight", "bronze", "/wiki/Abner_Teixeira", "/wiki/Brazil_at_the_2020_Summer_Olympics",
"/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_super_heavyweight", "bronze", "/wiki/Kamshybek_Kunkabayev", "/wiki/Kazakhstan_at_the_2020_Summer_Olympics",
"/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_flyweight", "bronze", "/wiki/Tsukimi_Namiki", "/wiki/Japan_at_the_2020_Summer_Olympics",
"/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_featherweight", "bronze", "/wiki/Karriss_Artingstall", "/wiki/Great_Britain_at_the_2020_Summer_Olympics",
"/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_featherweight", "bronze", "/wiki/Irma_Testa", "/wiki/Italy_at_the_2020_Summer_Olympics",
"/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_lightweight", "bronze", "/wiki/Mira_Potkonen", "/wiki/Finland_at_the_2020_Summer_Olympics",
"/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_welterweight", "bronze", "/wiki/Oshae_Jones", "/wiki/United_States_at_the_2020_Summer_Olympics",
"/wiki/Boxing_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_middleweight", "bronze", "/wiki/Zemfira_Magomedalieva", "/wiki/ROC_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_60_kg", "bronze", "/wiki/Luka_Mkheidze", "/wiki/France_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_66_kg", "bronze", "/wiki/Daniel_Cargnin_(judoka)", "/wiki/Brazil_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_73_kg", "bronze", "/wiki/Tsend-Ochiryn_Tsogtbaatar", "/wiki/Mongolia_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_81_kg", "bronze", "/wiki/Matthias_Casse", "/wiki/Belgium_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_90_kg", "bronze", "/wiki/Kriszti%C3%A1n_T%C3%B3th", "/wiki/Hungary_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_100_kg", "bronze", "/wiki/Niyaz_Ilyasov", "/wiki/ROC_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_%2B100_kg", "bronze", "/wiki/Tamerlan_Bashaev", "/wiki/ROC_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_48_kg", "bronze", "/wiki/Urantsetseg_Munkhbat", "/wiki/Mongolia_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_52_kg", "bronze", "/wiki/Chelsie_Giles", "/wiki/Great_Britain_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_57_kg", "bronze", "/wiki/Tsukasa_Yoshida", "/wiki/Japan_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_63_kg", "bronze", "/wiki/Catherine_Beauchemin-Pinard", "/wiki/Canada_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_70_kg", "bronze", "/wiki/Sanne_van_Dijke", "/wiki/Netherlands_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_78_kg", "bronze", "/wiki/Mayra_Aguiar", "/wiki/Brazil_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_%2B78_kg", "bronze", "/wiki/Romane_Dicko", "/wiki/France_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Tohar_Butbul", "/wiki/Israel_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Raz_Hershko", "/wiki/Israel_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Li_Kochman", "/wiki/Israel_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Inbar_Lanir", "/wiki/Israel_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Sagi_Muki", "/wiki/Israel_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Timna_Nelson-Levy", "/wiki/Israel_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Peter_Paltchik", "/wiki/Israel_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Shira_Rishony", "/wiki/Israel_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Or_Sasson", "/wiki/Israel_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Gili_Sharir", "/wiki/Israel_at_the_2020_Summer_Olympics",
"/wiki/Judo_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_team", "bronze", "/wiki/Baruch_Shmailov", "/wiki/Israel_at_the_2020_Summer_Olympics",
"/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_kata", "bronze", "/wiki/Ali_Sofuo%C4%9Flu", "/wiki/Turkey_at_the_2020_Summer_Olympics",
"/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_67_kg", "bronze", "/wiki/Abdelrahman_Al-Masatfa", "/wiki/Jordan_at_the_2020_Summer_Olympics",
"/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_75_kg", "bronze", "/wiki/Stanislav_Horuna", "/wiki/Ukraine_at_the_2020_Summer_Olympics",
"/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_%2B75_kg", "bronze", "/wiki/U%C4%9Fur_Akta%C5%9F_(karateka)", "/wiki/Turkey_at_the_2020_Summer_Olympics",
"/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_kata", "bronze", "/wiki/Viviana_Bottaro", "/wiki/Italy_at_the_2020_Summer_Olympics",
"/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_55_kg", "bronze", "/wiki/Wen_Tzu-yun", "/wiki/Chinese_Taipei_at_the_2020_Summer_Olympics",
"/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_61_kg", "bronze", "/wiki/Merve_%C3%87oban", "/wiki/Turkey_at_the_2020_Summer_Olympics",
"/wiki/Karate_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_%2B61_kg", "bronze", "/wiki/Sofya_Berultseva", "/wiki/Ukraine_at_the_2020_Summer_Olympics",
"/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_58_kg", "bronze", "/wiki/Mikhail_Artamonov_(taekwondo)", "/wiki/ROC_at_the_2020_Summer_Olympics",
"/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_68_kg", "bronze", "/wiki/Zhao_Shuai", "/wiki/China_at_the_2020_Summer_Olympics",
"/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_80_kg", "bronze", "/wiki/Seif_Eissa", "/wiki/Egypt_at_the_2020_Summer_Olympics",
"/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_%2B80_kg", "bronze", "/wiki/Rafael_Alba", "/wiki/Cuba_at_the_2020_Summer_Olympics",
"/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_49_kg", "bronze", "/wiki/Avishag_Semberg", "/wiki/Israel_at_the_2020_Summer_Olympics",
"/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_57_kg", "bronze", "/wiki/Hatice_K%C3%BCbra_%C4%B0lg%C3%BCn", "/wiki/Turkey_at_the_2020_Summer_Olympics",
"/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_67_kg", "bronze", "/wiki/Hedaya_Wahba", "/wiki/Egypt_at_the_2020_Summer_Olympics",
"/wiki/Taekwondo_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_%2B67_kg", "bronze", "/wiki/Bianca_Walkden", "/wiki/Great_Britain_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_freestyle_57_kg", "bronze", "/wiki/Thomas_Gilman", "/wiki/United_States_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_freestyle_65_kg", "bronze", "/wiki/Bajrang_Punia", "/wiki/India_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_freestyle_74_kg", "bronze", "/wiki/Bekzod_Abdurakhmonov", "/wiki/Uzbekistan_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_freestyle_86_kg", "bronze", "/wiki/Myles_Amine", "/wiki/San_Marino_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_freestyle_97_kg", "bronze", "/wiki/Abraham_Conyedo", "/wiki/Italy_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_freestyle_125_kg", "bronze", "/wiki/Taha_Akg%C3%BCl", "/wiki/Turkey_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_Greco-Roman_60_kg", "bronze", "/wiki/Sergey_Emelin", "/wiki/ROC_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_Greco-Roman_67_kg", "bronze", "/wiki/Mohamed_Ibrahim_El-Sayed", "/wiki/Egypt_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_Greco-Roman_77_kg", "bronze", "/wiki/Rafig_Huseynov", "/wiki/Azerbaijan_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_Greco-Roman_87_kg", "bronze", "/wiki/Zurab_Datunashvili", "/wiki/Serbia_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_Greco-Roman_97_kg", "bronze", "/wiki/Mohammad_Hadi_Saravi", "/wiki/Iran_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_Greco-Roman_130_kg", "bronze", "/wiki/Sergey_Semenov_(wrestler)", "/wiki/ROC_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_freestyle_50_kg", "bronze", "/wiki/Sarah_Hildebrandt", "/wiki/United_States_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_freestyle_53_kg", "bronze", "/wiki/Bat-Ochiryn_Bolortuyaa", "/wiki/Mongolia_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_freestyle_57_kg", "bronze", "/wiki/Evelina_Nikolova", "/wiki/Bulgaria_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_freestyle_62_kg", "bronze", "/wiki/Taybe_Yusein", "/wiki/Bulgaria_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_freestyle_68_kg", "bronze", "/wiki/Meerim_Zhumanazarova", "/wiki/Kyrgyzstan_at_the_2020_Summer_Olympics",
"/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_freestyle_76_kg", "bronze", "/wiki/Zhou_Qian", "/wiki/China_at_the_2020_Summer_Olympics",
# fix Barshim moved to separate row
"/wiki/Athletics_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_high_jump", "gold", "/wiki/Mutaz_Essa_Barshim", "/wiki/Qatar_at_the_2020_Summer_Olympics",
# fix missing competition for whatever reason
"/wiki/Canoeing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_C-1_1000_metres", "gold", "/wiki/Isaquias_Queiroz", "/wiki/Brazil_at_the_2016_Summer_Olympics",
"/wiki/Canoeing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_C-1_1000_metres", "silver", "/wiki/Liu_Hao_(canoeist)", "/wiki/China_at_the_2016_Summer_Olympics",
"/wiki/Canoeing_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_C-1_1000_metres", "bronze", "/wiki/Serghei_Tarnovschi", "/wiki/Moldova_at_the_2016_Summer_Olympics",
)
all_medalists_df <- bind_rows(all_medalists_df_pre,
double_medals) %>%
dplyr::filter(stringr::str_detect(string = medalist_link,pattern = "#", negate = TRUE)) %>%
distinct(event_link, medalist_link, .keep_all = TRUE)
#### check:
# dplyr::group_by(event_link, medalist_link) %>%
# count() %>%
# dplyr::filter(n>1)
# nrow(all_medalists_df_pre)+nrow(double_medals)
all_medalists_df %>%
dplyr::slice_sample(n = 10) %>%
knitr::kable()
event_link | medal | medalist_link | delegation_link |
---|---|---|---|
/wiki/Rugby_sevens_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_tournament | silver | /wiki/Caroline_Drouin | /wiki/France_at_the_2020_Summer_Olympics |
/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_freestyle_57_kg | bronze | /wiki/Helen_Maroulis | /wiki/United_States_at_the_2020_Summer_Olympics |
/wiki/Swimming_at_the_2020_Summer_Olympics_%E2%80%93_Mixed_4_%C3%97_100_metre_medley_relay | gold | /wiki/Adam_Peaty | /wiki/Great_Britain_at_the_2020_Summer_Olympics |
/wiki/Swimming_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_4_%C3%97_100_metre_medley_relay | gold | /wiki/Emily_Seebohm | /wiki/Australia_at_the_2020_Summer_Olympics |
/wiki/Table_tennis_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_team | bronze | /wiki/Minnie_Soo_Wai_Yam | /wiki/Hong_Kong_at_the_2020_Summer_Olympics |
/wiki/Weightlifting_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_%2B109_kg | silver | /wiki/Ali_Davoudi | /wiki/Iran_at_the_2020_Summer_Olympics |
/wiki/Football_at_the_2020_Summer_Olympics_%E2%80%93_Women%27s_tournament | silver | /wiki/Amanda_Ilestedt | /wiki/Sweden_at_the_2020_Summer_Olympics |
/wiki/Cycling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_road_time_trial | gold | /wiki/Primo%C5%BE_Rogli%C4%8D | /wiki/Slovenia_at_the_2020_Summer_Olympics |
/wiki/Swimming_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_400_metre_freestyle | silver | /wiki/Jack_McLoughlin | /wiki/Australia_at_the_2020_Summer_Olympics |
/wiki/Softball_at_the_2020_Summer_Olympics | gold | /wiki/Nayu_Kiyohara | /wiki/Japan_at_the_2020_Summer_Olympics |
So we have links to Wikipedia pages, which is a good start. But Wikipedia does not really have much structured data, so we need to find the Wikidata identifier for each of those Wikipedia pages. We could download the full html of each of those pages and extract the link from there, but we’ll play it nicely and use the Wikidata API, and parse the json we obtain from it.
So we’ll cache locally the json file we get from the Wikipedia page, and and associated to each of those links a Wikidata identifier.
For reference, here’s how a random Wikipedia API page looks like:
fs::dir_create("wikipedia_medalists_json")
wiki_api_links <- stringr::str_c("https://en.wikipedia.org/w/api.php?action=query&redirects=true&prop=pageprops&titles=",
stringr::str_remove(all_medalists_df$medalist_link, "/wiki/"),
"&format=json")
local_json_files <- fs::path("wikipedia_medalists_json",
all_medalists_df$medalist_link %>%
fs::path_file() %>%
paste0(., ".json"))
sample(wiki_api_links, size = 1)
[1] "https://en.wikipedia.org/w/api.php?action=query&redirects=true&prop=pageprops&titles=Li_Bingjie&format=json"
o2020_get_wikidata_from_link <- function(link) {
wiki_api_links_country <- stringr::str_c("https://en.wikipedia.org/w/api.php?action=query&redirects=true&prop=pageprops&titles=",
stringr::str_remove(link, "/wiki/"),
"&format=json")
local_json_files_country <- fs::path("wikipedia_medalists_json",
link %>%
fs::path_file() %>%
paste0(., ".json"))
wikidata_id <- purrr::map2_chr(
.x = wiki_api_links_country,
.y = local_json_files_country,
.f = function(x, y) {
if (fs::file_exists(y)==FALSE) {
download.file(url = x, destfile = y)
Sys.sleep(1)
}
jsonlite::read_json(path = y)
output <- jsonlite::read_json(path = y) %>%
purrr::pluck("query",
"pages",
1,
"pageprops",
"wikibase_item")
if (length(output)==0) {
as.character(NA)
} else {
output
}
})
#tibble::tibble(link = link, wikidata_id, wikidata_id)
wikidata_id
}
## check
# nrow(all_medalists_df)==length(wikidata_id)
# wiki_api_links[is.na(wikidata_id_medalists)]
I will keep the cached json files in the repo, so you don’t have to download them again; the Wikidata identifier should not change anyway.
all_medalists_wd_df <- all_medalists_df %>%
dplyr::mutate(medalist_wikidata_id = o2020_get_wikidata_from_link(medalist_link),
event_wikidata_id = o2020_get_wikidata_from_link(event_link),
delegation_wikidata_id = o2020_get_wikidata_from_link(delegation_link)) %>%
dplyr::mutate(event_link = paste0("https://en.wikipedia.org", event_link),
medalist_link = paste0("https://en.wikipedia.org", medalist_link),
delegation_link = paste0("https://en.wikipedia.org", delegation_link))
# due to frequent failures, consider something like this at first run
while (length(fs::dir_ls("wikipedia_medalists_json")<2600)) {
try(all_medalists_wd_df <- all_medalists_df %>%
dplyr::mutate(medalist_wikidata_id = o2020_get_wikidata_from_link(medalist_link),
event_wikidata_id = o2020_get_wikidata_from_link(event_link),
delegation_wikidata_id = o2020_get_wikidata_from_link(delegation_link)) %>%
dplyr::mutate(event_link = paste0("https://en.wikipedia.org", event_link),
medalist_link = paste0("https://en.wikipedia.org", medalist_link),
delegation_link = paste0("https://en.wikipedia.org", delegation_link)))
}
all_medalists_wd_df %>%
dplyr::slice_sample(n = 10) %>%
knitr::kable()
Are these all Olympics medalists? We now have 2175 and 339 events, which correspond to the figure given in the opening line of the Wikipedia page used as the source for this endeavour.
Apparently, yes.
So now we have successfully associated Wikidata identifiers with the relevant Wikipedia pages. Let the Wikidata-based fun begin.
Now that we have a Wikidata id, we’re on a downward slope to get a lot more data without much pain. Look at the beauty of the massive piped call that follows, with each line getting more and more data.
all_medalists_with_wd_details_df <- all_medalists_wd_df %>%
mutate(medalist_name = tw_get_label(medalist_wikidata_id),
place_of_birth_wikidata_id = tw_get_property_same_length(id = medalist_wikidata_id,
p = "P19",
only_first = TRUE,
preferred = TRUE),
date_of_birth = tw_get_property_same_length(id = medalist_wikidata_id,
p = "P569",
only_first = TRUE,
preferred = TRUE),
event_sport_wikidata_id = tw_get_property_same_length(id = event_wikidata_id,
p = "P361",
only_first = TRUE,
preferred = TRUE),
event_part_of_wikidata_id = tw_get_property_same_length(id = event_wikidata_id,
p = "P641",
only_first = TRUE,
preferred = TRUE),
event_name = tw_get_label(event_wikidata_id),
delegation_name = tw_get_label(delegation_wikidata_id),
sex_or_gender_wikidata_id = tw_get_property_same_length(id = medalist_wikidata_id,
p = "P21",
only_first = TRUE,
preferred = TRUE)) %>%
mutate(country_medal_wikidata_id = tw_get_property_same_length(id = delegation_wikidata_id,
p = "P17",
only_first = TRUE,
preferred = TRUE),
event_sport = tw_get_label(event_sport_wikidata_id),
event_part_of = tw_get_label(event_part_of_wikidata_id),
place_of_birth = tw_get_label(place_of_birth_wikidata_id),
place_of_birth_located_in_wikidata_id = tw_get_property_same_length(id = place_of_birth_wikidata_id,
p = "P131",
only_first = TRUE,
preferred = TRUE),
place_of_birth_coordinates = tw_get_property_same_length(id = place_of_birth_wikidata_id,
p = "P625",
only_first = TRUE,
preferred = TRUE),
date_of_birth = stringr::str_extract(string = date_of_birth,
pattern = "[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}") %>%
as.Date(),
sex_or_gender = tw_get_label(sex_or_gender_wikidata_id)
) %>%
tidyr::separate(col = place_of_birth_coordinates,
into = c("lat", "lon"),
sep = ",",
remove = FALSE) %>%
mutate(country_medal = tw_get_label(country_medal_wikidata_id),
place_of_birth_located_in = tw_get_label(place_of_birth_located_in_wikidata_id))
Now, life’s not perfect, and so isn’t Wikidata. Some athletes do not have a complete file on them on Wikidata, so we may miss some places of birth, for example. Also, in order to reduce complexity of the output, I have limited retrieval of some properties to the first result. For example, Wikidata knows that a place is located within more than one “administrative territorial entity” (or was at different points in time), but I just kept the first result to keep the data in tabular format and reduce complexity.
To aggregate results consistently at the level of region/province/state, I will rely on geographic datasets and match the coordinates with the place of birth. For the time being, I’ll do this only for Europe (all countries which are part of the NUTS standard). It should be relatively easy to expand to other countries if there’s a geographic dataset to do the matching, but this is a start, and data extracted from Wikidata will have to do for the rest of the world for the time being.
So let’s match the coordinates of the place of birth of each medalist with the geographic dataset of NUTS regions.
all_medalists_wd_nuts_df <- all_medalists_with_wd_details_df %>%
dplyr::left_join(
all_medalists_with_wd_details_df %>%
dplyr::filter(is.na(lon)==FALSE) %>%
dplyr::distinct(medalist_wikidata_id, .keep_all = TRUE) %>%
sf::st_as_sf(coords = c("lon","lat"), crs = 4326) %>%
sf::st_join(latlon2map::ll_get_nuts_eu(level = 1, year = 2016),
join = sf::st_intersects) %>%
st_drop_geometry() %>%
dplyr::transmute(medalist_wikidata_id,
nuts1_id = NUTS_ID,
nuts1_name = NAME_LATN),
by = "medalist_wikidata_id") %>%
dplyr::left_join(
all_medalists_with_wd_details_df %>%
dplyr::filter(is.na(lon)==FALSE) %>%
dplyr::distinct(medalist_wikidata_id, .keep_all = TRUE) %>%
sf::st_as_sf(coords = c("lon","lat"), crs = 4326) %>%
sf::st_join(latlon2map::ll_get_nuts_eu(level = 2, year = 2016),
join = sf::st_intersects) %>%
st_drop_geometry() %>%
dplyr::transmute(medalist_wikidata_id,
nuts2_id = NUTS_ID,
nuts2_name = NAME_LATN),
by = "medalist_wikidata_id") %>%
dplyr::left_join(
all_medalists_with_wd_details_df %>%
dplyr::filter(is.na(lon)==FALSE) %>%
dplyr::distinct(medalist_wikidata_id, .keep_all = TRUE) %>%
sf::st_as_sf(coords = c("lon","lat"), crs = 4326) %>%
sf::st_join(latlon2map::ll_get_nuts_eu(level = 3, year = 2016),
join = sf::st_intersects) %>%
st_drop_geometry() %>%
dplyr::transmute(medalist_wikidata_id,
nuts3_id = NUTS_ID,
nuts3_name = NAME_LATN),
by = "medalist_wikidata_id")
https://ec.europa.eu/eurostat/web/products-datasets/product?code=tgs00096
library("eurostat")
fs::dir_create("eurostat")
population_by_nuts2_2020_file <- fs::path("eurostat",
"population_by_nuts2_2020.csv")
if (fs::file_exists(population_by_nuts2_2020_file)==FALSE) {
eurostat::get_eurostat(id = "tgs00096", filters = list(age = "TOTAL",
sex = "T")) %>%
dplyr::arrange(geo, dplyr::desc(time)) %>%
dplyr::filter(is.na(values)==FALSE) %>%
dplyr::group_by(geo) %>%
dplyr::slice(1) %>%
dplyr::ungroup() %>%
tidyr::drop_na() %>%
dplyr::transmute(nuts2_id = geo, nuts2_population = values) %>%
dplyr::filter(nuts2_population!=0) %>%
readr::write_csv(population_by_nuts2_2020_file)
}
population_by_nuts2 <- readr::read_csv(file = population_by_nuts2_2020_file, show_col_types = FALSE)
population_by_nuts3_2020_file <- fs::path("eurostat",
"population_by_nuts3_2020.csv")
if (fs::file_exists(population_by_nuts3_2020_file)==FALSE) {
eurostat::get_eurostat(id = "demo_r_pjanaggr3",
filters = list(age = "TOTAL",
sex = "T")) %>%
dplyr::arrange(geo, dplyr::desc(time)) %>%
dplyr::filter(is.na(values)==FALSE) %>%
dplyr::group_by(geo) %>%
dplyr::slice(1) %>%
dplyr::ungroup() %>%
tidyr::drop_na() %>%
dplyr::transmute(nuts3_id = geo, nuts3_population = values) %>%
dplyr::filter(nuts3_population!=0) %>%
dplyr::filter(nchar(nuts3_id)==5) %>%
readr::write_csv(population_by_nuts3_2020_file)
}
population_by_nuts3 <- readr::read_csv(file = population_by_nuts3_2020_file, show_col_types = FALSE)
if (fs::file_exists(fs::path("eurostat",
"gdp_by_nuts3_2019.csv"))==FALSE) {
gdp_by_nuts_df <- eurostat::get_eurostat(id = "nama_10r_3gdp",
filters = list(unit = "MIO_EUR")) %>%
dplyr::arrange(geo, dplyr::desc(time)) %>%
dplyr::filter(is.na(values)==FALSE) %>%
dplyr::group_by(geo) %>%
dplyr::slice(1) %>%
dplyr::ungroup() %>%
tidyr::drop_na() %>%
dplyr::filter(values !=0)
gdp_by_nuts_df %>%
dplyr::transmute(nuts2_id = geo, nuts2_gdp = values) %>%
dplyr::filter(nchar(nuts2_id)==4) %>%
readr::write_csv(file = fs::path("eurostat",
"gdp_by_nuts2_2019.csv"))
gdp_by_nuts_df %>%
dplyr::transmute(nuts3_id = geo, nuts3_gdp = values) %>%
dplyr::filter(nchar(nuts3_id)==5) %>%
readr::write_csv(file = fs::path("eurostat",
"gdp_by_nuts3_2019.csv"))
}
gdp_by_nuts2_2019_df <- readr::read_csv(fs::path("eurostat",
"gdp_by_nuts2_2019.csv"),
show_col_types = FALSE)
gdp_by_nuts3_2019_df <- readr::read_csv(fs::path("eurostat",
"gdp_by_nuts3_2019.csv"),
show_col_types = FALSE)
all_medalists_wd_nuts_pop_df <- all_medalists_wd_nuts_df %>%
dplyr::left_join(y = population_by_nuts2, by = "nuts2_id") %>%
dplyr::left_join(y = population_by_nuts3, by = "nuts3_id") %>%
dplyr::left_join(y = gdp_by_nuts2_2019_df, by = "nuts2_id") %>%
dplyr::left_join(y = gdp_by_nuts3_2019_df, by = "nuts3_id") %>%
dplyr::mutate(nuts0_id = stringr::str_extract(string = nuts2_id, pattern = "[A-Z]{2}")) %>%
dplyr::mutate(nuts0_name = dplyr::case_when(is.na(nuts0_id) ~ as.character(NA),
nuts0_id == "EL" ~ "Greece",
nuts0_id == "UK" ~ "United Kingdom",
TRUE ~ countrycode::countrycode(sourcevar = nuts0_id,
origin = "iso2c",
destination = "country.name"))) %>%
dplyr::select(event_link,
event_wikidata_id,
event_name,
event_sport_wikidata_id,
event_sport,
event_part_of_wikidata_id,
event_part_of,
medal,
medalist_link,
medalist_wikidata_id,
medalist_name,
delegation_link,
delegation_wikidata_id,
delegation_name,
country_medal_wikidata_id,
country_medal,
date_of_birth,
sex_or_gender_wikidata_id,
sex_or_gender,
place_of_birth_wikidata_id,
place_of_birth,
place_of_birth_coordinates,
lat,
lon,
place_of_birth_located_in_wikidata_id,
place_of_birth_located_in,
nuts0_id,
nuts0_name,
nuts1_id,
nuts1_name,
nuts2_id,
nuts2_name,
nuts2_population,
nuts2_gdp,
nuts3_id,
nuts3_name,
nuts3_population,
nuts3_gdp)
Let’s check how many missing data we have for each column of data:
tibble::tibble(columns = colnames(all_medalists_wd_nuts_pop_df),
total = nrow(all_medalists_wd_nuts_pop_df),
total_missing = purrr::map_dbl(.x = all_medalists_wd_nuts_pop_df,
.f = function(x) sum(is.na(x)))) %>%
dplyr::mutate(share_missing = total_missing/total) %>%
mutate(share_missing = scales::percent(x = share_missing)) %>%
knitr::kable()
columns | total | total_missing | share_missing |
---|---|---|---|
event_link | 2401 | 0 | 0.000% |
event_wikidata_id | 2401 | 0 | 0.000% |
event_name | 2401 | 0 | 0.000% |
event_sport_wikidata_id | 2401 | 0 | 0.000% |
event_sport | 2401 | 0 | 0.000% |
event_part_of_wikidata_id | 2401 | 0 | 0.000% |
event_part_of | 2401 | 0 | 0.000% |
medal | 2401 | 0 | 0.000% |
medalist_link | 2401 | 0 | 0.000% |
medalist_wikidata_id | 2401 | 0 | 0.000% |
medalist_name | 2401 | 0 | 0.000% |
delegation_link | 2401 | 0 | 0.000% |
delegation_wikidata_id | 2401 | 0 | 0.000% |
delegation_name | 2401 | 0 | 0.000% |
country_medal_wikidata_id | 2401 | 146 | 6.081% |
country_medal | 2401 | 146 | 6.081% |
date_of_birth | 2401 | 8 | 0.333% |
sex_or_gender_wikidata_id | 2401 | 1 | 0.042% |
sex_or_gender | 2401 | 1 | 0.042% |
place_of_birth_wikidata_id | 2401 | 63 | 2.624% |
place_of_birth | 2401 | 69 | 2.874% |
place_of_birth_coordinates | 2401 | 74 | 3.082% |
lat | 2401 | 74 | 3.082% |
lon | 2401 | 74 | 3.082% |
place_of_birth_located_in_wikidata_id | 2401 | 110 | 4.581% |
place_of_birth_located_in | 2401 | 128 | 5.331% |
nuts0_id | 2401 | 1573 | 65.514% |
nuts0_name | 2401 | 1573 | 65.514% |
nuts1_id | 2401 | 1573 | 65.514% |
nuts1_name | 2401 | 1573 | 65.514% |
nuts2_id | 2401 | 1573 | 65.514% |
nuts2_name | 2401 | 1573 | 65.514% |
nuts2_population | 2401 | 1573 | 65.514% |
nuts2_gdp | 2401 | 1709 | 71.179% |
nuts3_id | 2401 | 1573 | 65.514% |
nuts3_name | 2401 | 1573 | 65.514% |
nuts3_population | 2401 | 1573 | 65.514% |
nuts3_gdp | 2401 | 1710 | 71.220% |
The share of missing data for NUTS seems high, but this is expected as non-European medalists are mostly not born within a European NUTS region. Only of handful of European medalists do nothave information on their place of birth.
all_medalists_wd_nuts_pop_df %>%
readr::write_csv(file = "medalists_all.csv")
all_medalists_wd_nuts_pop_df %>%
dplyr::filter(is.na(nuts0_id)==FALSE) %>%
readr::write_csv(file = "medalists_nuts_only.csv")
all_medalists_wd_nuts_pop_df %>%
dplyr::filter(is.na(place_of_birth_wikidata_id)==TRUE) %>%
dplyr::select(event_name, medal, medalist_name, medalist_wikidata_id, country_medal, delegation_name, sex_or_gender) %>%
readr::write_csv(file = "medalists_missing_place_of_birth.csv")
all_medalists_wd_nuts_pop_df %>%
dplyr::filter(is.na(nuts2_name)==FALSE) %>%
dplyr::group_by(nuts2_name) %>%
dplyr::count(name = "medals", sort = TRUE) %>%
dplyr::ungroup() %>%
dplyr::left_join(y = all_medalists_wd_nuts_pop_df %>%
dplyr::distinct(nuts2_id, nuts2_name, nuts2_population, nuts0_name), by = "nuts2_name") %>%
dplyr::mutate(medals_per_million_residents = 1e06/(nuts2_population/medals)) %>%
dplyr::arrange(dplyr::desc(medals_per_million_residents)) %>%
dplyr::transmute(nuts2_id,
nuts2_name,
country = nuts0_name,
medals,
nuts2_population,
medals_per_million_residents) %>%
readr::write_csv(file = "medals_per_million_residents_in_nuts2.csv")
You can download the dataset from the repository including this script, or directly clicking on this link.
Here is the dataset for only medalists with place of birth in a NUTS region.
Here is the dataset for only medalists with no known place of birth in Wikidata (a good starting place to contribute to Wikidata).
See this blog post introducing the dataset, with some preliminary results about which NUTS 2 got more medals.
library("dplyr", warn.conflicts = FALSE)
library("leaflet")
all_medalists_df <- readr::read_csv("medalists_all.csv", show_col_types = FALSE) %>%
dplyr::filter(is.na(lon)==FALSE)
# %>%
# dplyr::mutate(img_filename = purrr::map_chr(.x = medalist_wikidata_id,
# function(x) {tw_get_image(id = x)[1]}))
all_medalists_ll <- all_medalists_df %>%
dplyr::select(lat, lon, medal, medalist_wikidata_id, medalist_name, medalist_link, event_part_of, delegation_name, place_of_birth
#, img_filename
) %>%
dplyr::mutate(medal_icon_link = dplyr::case_when(medal == "gold" ~ "https://upload.wikimedia.org/wikipedia/commons/4/4f/Gold_medal_olympic.svg",
medal == "silver" ~ "https://upload.wikimedia.org/wikipedia/commons/6/67/Silver_medal_olympic.svg",
medal == "bronze" ~ "https://upload.wikimedia.org/wikipedia/commons/f/f9/Bronze_medal_olympic.svg")) %>%
dplyr::group_by(medalist_wikidata_id) %>%
dplyr::mutate(medal = stringr::str_c(medal,collapse = ", "), event_part_of = stringr::str_c(unique(event_part_of), collapse = ", ")) %>%
dplyr::mutate(popup_content = stringr::str_c(
"<big><b><a href='", medalist_link, "' target='_blank'>", medalist_name, "</a></b><br />",
"Born in ", place_of_birth,"<br />",
"Won ", medal, " medal in ", event_part_of, " for ", stringr::str_remove(string = delegation_name, pattern = " at the 2020 Summer Olympics"), "</big>"
#,
# "<a href='https://en.wikipedia.org", medalist_link, "'>", "<img src='", "https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/", img_filename, "&width=120'></a>"
)) %>%
dplyr::ungroup() %>%
dplyr::group_by(lat, lon) %>%
dplyr::add_count(name = "n_born_same_place") %>%
dplyr::mutate(lat = dplyr::if_else(condition = n_born_same_place>1,
true = jitter(x = lat, amount = 0.01),
false = lat,
missing = as.numeric(NA))) %>%
dplyr::mutate(lon = dplyr::if_else(condition = n_born_same_place>1,
true = jitter(x = lon, amount = 0.01),
false = lon,
missing = as.numeric(NA)))
# remove the following line if you're running it on your own:
# <script defer data-domain=\"edjnet.github.io\" src=\"https://plausible.europeandatajournalism.eu/js/plausible.js\"></script>
# the first part is necessary for responsive popups on mobile
responsiveness_and_stats <- "\'<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\"><script defer data-domain=\"edjnet.github.io\" src=\"https://plausible.europeandatajournalism.eu/js/plausible.js\"></script>\'"
leaflet_medals <- all_medalists_ll %>%
leaflet() %>%
leaflet::addTiles(urlTemplate = "https://{s}.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}{r}.png") %>%
leaflet::addMarkers(
lng = all_medalists_ll$lon,
lat = all_medalists_ll$lat,
popup = all_medalists_ll$popup_content,
icon = ~ icons(
iconUrl = medal_icon_link,
iconWidth = 24, iconHeight = 24,
)
) %>%
htmlwidgets::onRender(paste0("
function(el, x) {
$('head').append(",responsiveness_and_stats,");
}"))
htmlwidgets::saveWidget(leaflet_medals, file="medalists_map.html",
title = "Medalists at the 2020 Summer Olympics by place of birth",
selfcontained = TRUE)
leaflet_medals
You can see this interactive map in full screen following this link.
place_of_birth_table <- readr::read_csv("medalists_all.csv", show_col_types = FALSE) %>%
dplyr::distinct(medalist_wikidata_id, lon) %>%
mutate(place_of_birth_coordinates = is.na(lon)==FALSE) %>%
summarise(place_of_birth = sum(is.na(lon)==FALSE), missing_place_of_birth = sum(is.na(lon)==TRUE)) %>%
mutate(share_present = scales::percent(1-(missing_place_of_birth/place_of_birth)))
At last update, place of birth is present for 97% of medalists, so only 72 medalists out of 2175 are missing from this map.
place_of_birth | missing_place_of_birth | share_present |
---|---|---|
2103 | 72 | 97% |