library(RISmed)
library(parallel)
library(ggplot2)
# Given two lists of terms, let's see how 'hot' they are together
set1 <- c("ebola","autoimmune","Diabetes","HIV","Glioblastoma","Asthma","Schizophrenia")
set2 <- c("C. elegans","D. Melanogaster","C. japonica", "M. Musculus","S. Cerevisiae")
# Generate all possible pairs
pairs <- expand.grid(set1, set2, stringsAsFactors=F)
# Search pubmed for each pair, and return the number of search results.
results <- mclapply(seq(nrow(pairs)), function(x) {
res <- EUtilsSummary(sprintf("%s %s", pairs[x,]$Var1, pairs[x,]$Var2), type='esearch', db='pubmed')
c(q1=pairs[x,]$Var1, q2=pairs[x,]$Var2, count=QueryCount(res))
})
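# NCBI rate-limits E-utilities requests (roughly 3 per second without an API key),
# so hammering it in parallel can drop queries. A slower, hedged serial sketch with
# throttling and error handling (same inputs and outputs as the block above):
# results <- lapply(seq(nrow(pairs)), function(x) {
#   Sys.sleep(0.4) # stay under the rate limit
#   res <- tryCatch(EUtilsSummary(sprintf("%s %s", pairs[x,]$Var1, pairs[x,]$Var2),
#                                 type='esearch', db='pubmed'),
#                   error=function(e) NULL)
#   count <- if (is.null(res)) NA else QueryCount(res)
#   c(q1=pairs[x,]$Var1, q2=pairs[x,]$Var2, count=count)
# })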
# Do some data formatting on the results.
results <- as.data.frame(do.call("rbind", results), stringsAsFactors=F)
# Turn the number of search results into numeric form.
results$count <- as.numeric(results$count)
# Plot the results using geom_tile
ggplot(results) +
geom_tile(aes(x=q1, y=q2, fill=count)) +
geom_text(aes(x=q1, y=q2, label=count), color = "white") +
labs(title="Disease Publications by Organism", x="x", y="y")
library(stringr)
library(dplyr)
"""
# Generate concatenated worm_track data using the following
for folder in `ls -d *\/`; do
for file in `ls $folder/worm*`; do
cat $file | awk -v file=$file '{print file","$1}' >> worm_track_all.txt
done;
done;
"""
# Read the concatenated file back in: the awk step prepends the source path, so
# column 1 is the path and the rest are the original track columns (this assumes
# the worm files are themselves comma-separated).
worm_track_all <- read.csv("worm_track_all.txt", header=FALSE, stringsAsFactors=FALSE)
names(worm_track_all) <- c("Folder","time","x","y","blps","mmps")
worm_track_all[,c("Folder", "Filename")] <- str_split_fixed(worm_track_all$Folder, "//", 2) # paths look like "folder//file" (ls -d */ keeps the trailing slash)
library("RODBC") #load package
db <- file.path("C:/path/to/your/database.accdb") # path to the Access database
# Note the UNIX-style forward slash (/). "\" is the escape character in R, so
# replace each "\" in the path with "/" or "\\".
channel <- odbcConnectAccess2007(db) # open a connection (RODBC)
dataSetName <- sqlFetch(channel, "TableName") # read a particular table from the Access database file
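# A hedged follow-up: arbitrary SQL also works over the same channel ("TableName"
# is a placeholder), and the connection should be closed when you are done.
# firstRows <- sqlQuery(channel, "SELECT TOP 10 * FROM TableName")
# odbcClose(channel)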
# This R script plots all of your Runkeeper data. It uses cluster analysis to
# group activities by location as needed, and outputs one plot per location.
# Special thanks for insights from flowingdata.com regarding this.
library(plotKML)
library(plyr)
library(dplyr)
library(fpc)
library(mapproj) # for mapproject() below
num_locations <- 5
# Usage: Place this script in the directory containing your runkeeper data. You can run from terminal using 'Rscript map_runkeeper.R', or
# set your working directory to the location and run within RStudio (use setwd("~/location/of/runkeeper/data")).
# See below on how to set the number of clusters.
# GPX files downloaded from Runkeeper
files <- dir(pattern = "\\.gpx$")
# Generate vectors for data frame
index <- c()
latitude <- c()
longitude <- c()
file <- c()
cnt <- 1 # route counter
#
for (f in 1:length(files)) {
curr_route <- readGPX(files[f])
# Treat interrupted GPS paths as separate routes (useful if you occasionally stop running, walk for a bit, and start again, like I do).
for (i in curr_route$tracks[[1]]) {
cnt <- cnt + 1
location <- i
file <- c(file,rep(files[f], dim(location)[1]))
index <- c(index, rep(cnt, dim(location)[1]))
latitude <- c(latitude, location$lat)
longitude <- c(longitude, location$lon)
}
}
routes <- data.frame(cbind(index, latitude, longitude, file)) # cbind() coerces everything to character, hence the type fixes below
# Because the routes dataframe takes a while to generate for some folks - save it!
save(routes, file="routes.Rdata")
# Use to load as needed.
load("routes.Rdata")
# Fix data types
routes$file <- as.character(routes$file)
routes$latitude <- as.numeric(as.character(routes$latitude))
routes$longitude <- as.numeric(as.character(routes$longitude))
routes <- transform(routes, index = as.numeric(as.character(index))) # as.character first: as.numeric() on a factor returns level codes
# Load Meta Data
meta_data <- read.csv("cardioActivities.csv", stringsAsFactors=FALSE)
meta_data <- plyr::rename(meta_data, c("GPX.File" = "file")) # be explicit: dplyr masks plyr::rename
# Bind routes
routes <- left_join(routes, meta_data, by="file") %>%
arrange(index)
# Use this function to specify activity colors if you log multiple activity types.
activity_color <- function(activity) {
if (activity=="Cycling") {
color = "#00000060"
} else if (activity=="Hiking") {
color = "#00000060"
} else {
color = "#0080ff60"
}
color
}
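# e.g. activity_color("Cycling") returns the translucent black; any unlisted
# activity (such as "Running") falls through to the translucent blue default.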
# Identify clusters of points, which will correspond to locations you have run. For example,
# I have run in Boston, Iowa City, Chicago, and a few other cities. You will want to set the minimum krange
# to the number of cities you have run in (5 in my case).
clusters <- pamk(routes[,c("latitude", "longitude")], krange=num_locations:20, usepam=F)$pamobject$medoids # usepam=F uses clara(), which wants raw coordinates rather than a dissimilarity matrix
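# quick sanity check: each row of 'clusters' is a (latitude, longitude) medoid,
# one per detected location, so the row count should match your number of cities
print(clusters)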
# Plot Everything
for (r in 1:nrow(clusters)) {
print(r)
lat_range <- clusters[r, 1] + rnorm(20, sd=0.1)
lon_range <- clusters[r, 2] + rnorm(20, sd=0.1)
setroutes <- filter(routes, (latitude > min(lat_range) & latitude < max(lat_range)),
longitude > min(lon_range) & longitude < max(lon_range))
routeIds <- unique(setroutes$index)
# Rectangular (equirectangular) projection; par=38 sets the standard latitude
locProj <- mapproject(setroutes$longitude, setroutes$latitude, "rectangular", par=38)
setroutes$lonproj <- locProj$x # projected longitude
setroutes$latproj <- locProj$y # projected latitude
# Map the projected points
pdf(sprintf("%s-all.pdf", r))
plot(setroutes$lonproj, setroutes$latproj, type="n", asp=1, axes=FALSE, xlab="", ylab="")
for (i in routeIds) {
currRoute <- subset(setroutes, index==i)
lines(currRoute$lonproj, currRoute$latproj, col=activity_color(currRoute$Type[1]), lwd=0.4) # Type is constant within a route
}
dev.off()
}
# Requirements
#sudo apt-get install libcurl4-gnutls-dev # for RCurl on linux
#install.packages('RCurl')
#install.packages('RJSONIO')
library('RCurl')
library('RJSONIO')
# Run a Cypher query against Neo4j's REST endpoint (the legacy /db/data/cypher
# API of Neo4j 2.x) and return the result as a data frame.
query <- function(querystring) {
h = basicTextGatherer()
curlPerform(url="http://localhost:7474/db/data/cypher",
postfields=paste('query',curlEscape(querystring), sep='='),
writefunction = h$update,
verbose = FALSE
)
result <- fromJSON(h$value())
data <- data.frame(t(sapply(result$data, unlist)))
names(data) <- result$columns
data
}
# EXAMPLE
# =======
# Cypher Query:
q <- "match (o:Organization)-[r]-(p:Person) return o.name,o.location,p.account,p.name,p.email limit 20"
data <- query(q)
head(data,20)
# Output:
# o.name o.location p.account p.name p.email
# 1 PerfectLine Estonia kritik
# 2 Sappho OSS London UK andrewheald
# 3 The 88 NYC aface1
# 4 The 88 NYC xbilldozer
# 5 The 88 NYC chadyj
# 6 The 88 NYC benmanns Benjamin Manns benmanns@gmail.net
# 7 simplabs Munich, Germany marcoow
# 8 Everyday Hero Brisbane, Australia soloman1124
# 9 Everyday Hero Brisbane, Australia orodio
# 10 Everyday Hero Brisbane, Australia justinhennessy
# 11 Everyday Hero Brisbane, Australia coop Tim Cooper coop@latrobest.org
# 12 Everyday Hero Brisbane, Australia evilmarty Marty Zalega marty@zalega.co
# 13 Sorenson Media Salt Lake City, UT & San Diego, CA bcarlson
# 14 Sorenson Media Salt Lake City, UT & San Diego, CA elmomalmo
# 15 Sorenson Media Salt Lake City, UT & San Diego, CA enthooz
# 16 3scale Barcelona, Spain and Sunnyvale, USA solso
# 17 3scale Barcelona, Spain and Sunnyvale, USA MarkCheshire
# 18 3scale Barcelona, Spain and Sunnyvale, USA rhoml
# 19 3scale Barcelona, Spain and Sunnyvale, USA mikz Michal Cichra
# 20 3scale Barcelona, Spain and Sunnyvale, USA njyx
## load the packages
library(XML)
library(RCurl)
library(plyr)
library(ggplot2)
library(reshape2)
library(stringr)
## the page
URL = "http://www.hockey-reference.com/leagues/NHL_2014_standings.html"
## read the raw page
team_page = getURL(URL)
team_page = htmlParse(team_page)
## parse out the links
links = xpathSApply(team_page, '//a/@href')
names(links) = NULL
team_links = links[str_detect(links, "/2014\\.html$")]
team_links = unique(team_links)
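# sanity check: each link should look like "/teams/XXX/2014.html" (three-letter
# team code, extracted later with str_extract)
head(team_links)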
## for each link, grab the page, grab the skater stats, save to master dataset
skaters = data.frame(stringsAsFactors=F)
for (TEAM in team_links) {
# build the page
URL = paste0("http://www.hockey-reference.com", TEAM)
# grab the page
tm_page = readHTMLTable(URL, stringsAsFactors=F)
# grab the skater stats
tmp_skaters = tm_page$skaters
# fix a couple of the names
names(tmp_skaters)[10] = "plusminus"
names(tmp_skaters)[20] = "shotpct"
# add the team
team = str_extract(TEAM, "[A-Z]{3}")
tmp_skaters$team = team
# append this team's skaters to the master data frame
skaters = rbind.fill(skaters, tmp_skaters)
# status
cat("finished ", TEAM, "\n")
}
## how many skaters does each team have?
ddply(skaters, .(team), summarise, num_players = length(Player))
ggplot(skaters, aes(factor(team))) +
geom_bar() +
theme_bw() +
labs(title="Title", x="Team", y="")
## classify the goal scorer types
skaters$G = as.numeric(skaters$G)
summary(skaters$G)
skaters$skater_type = cut(skaters$G,
breaks=c(0, 5, 10, 20, 30, 40, 50, 60),
include.lowest = T,
right = F)
## team distributions
tbl_dist = with(skaters, table(team, skater_type))
round(prop.table(tbl_dist, 1), 2) ## row distributions
## metric = split on 20 + (or a better number)
## scatterplot (x = percentage, y = volume)
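## A minimal sketch of that scatterplot, assuming the 20-goal cutoff suggested
## above: for each team, the share of skaters with 20+ goals (x) against the
## roster size (y).
team_metric = ddply(skaters, .(team), summarise,
                    num_players = length(Player),
                    pct_20plus = mean(G >= 20, na.rm=TRUE))
ggplot(team_metric, aes(x=pct_20plus, y=num_players, label=team)) +
  geom_text(size=3) +
  theme_bw() +
  labs(title="Goal-scoring depth by team", x="Share of skaters with 20+ goals", y="Number of skaters")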
## BELOW HERE IS TEMP
## grab the data
## http://goo.gl/0ZurK
tables = readHTMLTable(URL, stringsAsFactors=F) # needed below; URL should point at the league skater-stats page (see link above)
# length(tables)
# names(tables)
# head(tables$standings)
## XML package FTW!
## bring the data into a dataframe
nhl_14 = tables$stats
colnames(nhl_14) = tolower(colnames(nhl_14))
## change the rank column and remove the row breaks
nhl_14$rk = as.numeric(nhl_14$rk)
nhl_14 = subset(nhl_14, !is.na(rk))
nhl_14 = subset(nhl_14, tm != 'Tm')
## STOP: Vanek is rolled up; need to crawl team by team and summarize that way.
## lets look at the goals column to get a sense of the data
nhl_14$g = as.numeric(nhl_14$g)
summary(nhl_14$g)
## cut the goals variable into groups
nhl_14$break5 = cut(nhl_14$g,
breaks = seq(0, 60, 5),
include.lowest=T,
right=F)
nhl_14$break10 = cut(nhl_14$g,
breaks = seq(0, 60, 10),
include.lowest=T,
right=F)
nhl_14$core_breaks = cut(nhl_14$g,
breaks = c(0, 10, 20, 25, 30, 40, 50, 60),
include.lowest=T,
right=T)
## need to isolate players if they have multiple stints;
## make a single record by selecting the max stint number, but summarize per player
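## a hedged sketch of that de-duplication, assuming the lowercased player column
## is named "player": collapse multiple stints into one record per player
# nhl_14_dedup = ddply(nhl_14, .(player), summarise, g = sum(g, na.rm=TRUE))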
## quick distribution of the types
ggplot(nhl_14, aes(core_breaks)) + geom_bar() # core_breaks is a factor, so geom_bar() rather than geom_histogram()
## summarize by team
by_team <- ddply(nhl_14, .(tm, core_breaks), summarise, num_players = length(g)) # count skaters per team in each goal bracket