library('rlist')
suppressMessages(library('dplyr'))
library('rvest')
library('xml2')
library('DT')
# Research Computing Organizations
# Simple R script to pull GitHub profiles matching "research computing" in the organization name
## Search Research Computing organizations
# Query the GitHub search API for organizations with "research computing" in
# their name (two pages of 100 results each), load each page as JSON via
# rlist, flatten the nested result, and keep only the 'login' fields.
rc.org <- "https://api.github.com/search/users?q=research+computing+in:name+type:org&type=Users&per_page=100&page=%d" %>%
  sprintf(1:2) %>%                 # vectorized: yields the page-1 and page-2 URLs
  list.load("json") %>%
  list.ungroup(level = 3) %>%
  .[names(.) %in% "login"] %>%     # keep only the organization login names
  as.character()
# Scrape Function
# The search resulted in n=130 "Research Computing" organizations.
# To avoid the GH API rate limit, github_scrape(...) scrapes each public
# organization profile page directly instead of calling the API.
#
# Args:
#   org_names: a single GitHub organization login (character scalar).
# Returns:
#   A named character vector with elements github, name, website, twitter.
github_scrape <- function(org_names) {
  # Build the organization profile URL and fetch the page
  URL <- paste0("https://github.com/", org_names)
  edata <- read_html(URL)
  body_nodes <- edata %>%
    html_node('body') %>%
    html_children()

  # LOCATION — scraped but not included in the returned vector
  # NOTE(review): kept for parity with downstream expectations; currently unused.
  LOCATION <- body_nodes %>%
    xml_find_all("//span[contains(@itemprop, 'location')]") %>%
    html_text()

  # Organization website: itemprop='url' anchors that are NOT Twitter handles
  WEBSITE <- body_nodes %>%
    xml_find_all("//a[contains(@itemprop, 'url')]") %>%
    html_text() %>% tibble::enframe() %>%
    filter(!startsWith(value, '@')) %>%
    select(value) %>% as.character()

  # Organization Name: collapse runs of whitespace, then trim
  NAME <- body_nodes %>%
    xml_find_all("//h1[contains(@class, 'h2 lh-condensed')]") %>%
    html_text() %>%
    gsub(pattern = "\\s+", replacement = " ") %>%
    stringr::str_trim()

  # Twitter handle: itemprop='url' anchors that start with '@'
  TWITTER <- body_nodes %>%
    xml_find_all("//a[contains(@itemprop, 'url')]") %>%
    html_text() %>% tibble::enframe() %>%
    filter(startsWith(value, '@')) %>%
    select(value) %>% as.character()

  # Metadata vector for this organization
  VEC <- c(github = org_names, name = NAME, website = WEBSITE, twitter = TWITTER)
  return(VEC)
}
# Do call: scrape every organization and stack the metadata vectors row-wise
rc.metadata <- do.call(rbind, lapply(rc.org, FUN = github_scrape))

# Load scraped metadata into a data frame, sorted by GitHub login
meta.df <- as.data.frame(rc.metadata)
meta.df <- meta.df[order(meta.df$github), ]
# Add GitHub hyperlinks: wrap each login in an <a> tag pointing at its profile
# (rendered later by DT with escape = FALSE)
ghURL <- paste0("https://github.com/", meta.df$github)
meta.df$github <- paste0("<a href='", ghURL, "'",
                         ' target=\"_blank\">',
                         meta.df$github, "</a>")

# Remove NA's: fields that scraped nothing come back as the string "character(0)"
meta.df[meta.df == "character(0)"] <- NA

# Reorder columns
meta.df <- subset(meta.df, select = c('github', 'name', 'twitter', 'website'))
# Convert to DT: interactive HTML table; escape = FALSE so the <a> links
# built above render as hyperlinks rather than literal markup.
meta.dt <- DT::datatable(meta.df, escape = FALSE, rownames = FALSE,
                         options = list(pageLength = 100,
                                        autoWidth = TRUE,
                                        fixedColumns = list(leftColumns = 0)))
meta.dt