r Example of pairwise PubMed searches

Example of pairwise PubMed searches

pubmed pairwise.R
library(RISmed)
library(parallel)
library(ggplot2)

# Given two lists of terms, let's see how 'hot' they are together
set1 <- c("ebola","autoimmune","Diabetes","HIV","Glioblastoma","Asthma","Schizophrenia")
set2 <- c("C. elegans","D. Melanogaster","C. japonica", "M. Musculus","S. Cerevisiae")

# Generate all possible pairs
pairs <- expand.grid(set1, set2, stringsAsFactors=F)

# Search pubmed for each pair, and return the number of search results.
results <- mclapply(seq(nrow(pairs)),  function(x) {
res <- EUtilsSummary(sprintf("%s %s", pairs[x,]$Var1, pairs[x,]$Var2), type='esearch', db='pubmed')
c(q1=pairs[x,]$Var1, q2=pairs[x,]$Var2, count=QueryCount(res))
})

# Do some data formatting on the results.
results <- as.data.frame(do.call("rbind", results), stringsAsFactors=F)
# Turn the number of search results into numeric form.
results$count <- as.numeric(results$count)

# Plot the results using geom_tile
ggplot(results) +
	geom_tile(aes(x=q1, y=q2, fill=count)) +
	geom_text(aes(x=q1, y=q2, label=count), color = "white") + 
	labs(title="Disease Publications by Organism", x="Disease", y="Organism")

r R trouble

R trouble

R trouble.r
Problem solved. The faulty code was on line 16: geom_line(aes(y = monthly$CumOil, colour = "CumOil, 1000 m3"))

R version 3.1.0 (2014-04-10) -- "Spring Dance"
Copyright (C) 2014 The R Foundation for Statistical Computing
Platform: i386-w64-mingw32/i386 (32-bit)

Code:

library("ggplot2")
ggplot(monthly[monthly$Well_N == "NSD1",],aes(Date_m)) +
  geom_line(aes(y = QOM, colour = "Oil, m3/month")) +
  geom_point(aes(y = QOM, colour = "Oil, m3/month")) +
  geom_line(aes(y = QWM, colour = "Water, m3/month")) +
  geom_line(aes(y = QFM, colour = "Liquid, m3/month")) +
  geom_line(aes(y = WCT*10, colour = "WCT (1000=100%)")) +
  geom_line(aes(y = monthly$CumOil, colour = "CumOil, 1000 m3"))

Output:

> library("ggplot2")
> ggplot(monthly[monthly$Well_N == "NSD1",],aes(Date_m)) +
+   geom_line(aes(y = QOM, colour = "Oil, m3/month")) +
+   geom_point(aes(y = QOM, colour = "Oil, m3/month")) +
+   geom_line(aes(y = QWM, colour = "Water, m3/month")) +
+   geom_line(aes(y = QFM, colour = "Liquid, m3/month")) +
+   geom_line(aes(y = WCT*10, colour = "WCT (1000=100%)")) +
+   geom_line(aes(y = monthly$CumOil, colour = "CumOil, 1000 m3"))
Error: geom_line requires the following missing aesthetics: y

Data:

str(monthly)
'data.frame':	6150 obs. of  11 variables:
 $ Well_N: Factor w/ 93 levels "ABL2","ABL7",..: 21 21 21 21 21 17 21 17 21 17 ...
 $ Date_m: POSIXct, format: "2013-08-31" "2013-09-30" "2013-10-31" ...
 $ QOM   : num  0 0 0 0 0 0 0 0 0 0 ...
 $ QWM   : num  0 0 0 0 0 0 0 0 0 0 ...
 $ QOMT  : num  NA NA NA NA NA NA NA NA NA NA ...
 $ BHP   : num  NA NA NA NA NA NA NA NA NA NA ...
 $ PRES  : num  NA NA NA NA NA NA NA NA NA NA ...
 $ QIW   : num  12958 11920 12695 12461 13085 ...
 $ THPI  : num  NA NA NA NA NA NA NA NA NA NA ...
 $ QFM   : num  0 0 0 0 0 0 0 0 0 0 ...
 $ WCT   : num  0 0 0 0 0 0 0 0 0 0 ...
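
A possible corrected version (a sketch only: str(monthly) shows no CumOil column, so monthly$CumOil is NULL and ggplot2 reports the missing y aesthetic; CumOil is assumed here to be cumulative QOM per well):

library(ggplot2)
nsd1 <- monthly[monthly$Well_N == "NSD1",]
nsd1 <- nsd1[order(nsd1$Date_m),]        # keep months in order before cumulating
nsd1$CumOil <- cumsum(nsd1$QOM) / 1000   # assumed definition: cumulative oil in 1000 m3
ggplot(nsd1, aes(Date_m)) +
  geom_line(aes(y = QOM, colour = "Oil, m3/month")) +
  geom_line(aes(y = CumOil, colour = "CumOil, 1000 m3"))  # bare column name, not monthly$CumOil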

r Concatenate multiple files across folders (keeping foldername and filename), with the bash included

Concatenate multiple files across folders (keeping foldername and filename), with the bash included

worm_tracker.R
library(stringr)
library(dplyr)
"""
# Generate concatenated worm_track data using the following
for folder in `ls -d *\/`; do
    for file in `ls $folder/worm*`; do
        cat $file | awk -v file=$file '{print file","$1}' >> worm_track_all.txt
    done;
done;
"""

names(worm_track_all) <- c("Folder","time","x","y","blps","mmps")

worm_track_all[,c("Folder", "Filename")] <- str_split_fixed(worm_track_all$Folder, "///",2)

r Connect an MS Access database to R (RStudio) on Windows

Connect an MS Access database to R (RStudio) on Windows

MSAccessDBConnection.r
library("RODBC") #load package
db<-file.path("C:/path/to/your/database.accdb") #connect database.
#Note the UNIX style slash (/). "\" is "escape character" so all "\"you should replace either with "/" or "\\"
channel<-odbcConnectAccess2007(db) #internal RODBC function
dataSetName<-sqlFetch(channel,"TableName") #read particular table from Access database file.
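#A few optional follow-ups on the same channel (sketch only; "TableName" is the placeholder table name used above)
sqlTables(channel) #list the tables available in the database
results<-sqlQuery(channel,"SELECT * FROM TableName") #run an arbitrary SQL query (placeholder query)
odbcClose(channel) #close the connection when finished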

r This R script plots all of your Runkeeper data. It uses cluster analysis to group activities by location as needed and outputs a plot

This R script plots all of your Runkeeper data. It uses cluster analysis to group activities by location as needed and outputs a plot for each location. For example, I have run in Iowa City, Boston, and Chicago; this script is able to identify those locations separately and produce an output for each.

plot_runkeeper.R
# Special thanks for insights from flowingdata.com regarding this.

library(plotKML)
library(plyr)
library(dplyr)
library(fpc)
library(mapproj) # for mapproject()

num_locations <- 5

# Usage: Place this script in the directory containing your Runkeeper data. You can run it from a terminal using 'Rscript map_runkeeper.R', or
# set your working directory to that location and run it within RStudio (use setwd("~/location/of/runkeeper/data")).
# See below on how to set the number of clusters.

# GPX files downloaded from Runkeeper
files <- dir(pattern = "\\.gpx")

# Generate vectors for data frame
index <- c()
latitude <- c()
longitude <- c()
file <- c()

c <- 1 # Set up Counter

# Read each GPX file and collect its track points.
for (f in 1:length(files)) {
  curr_route <- readGPX(files[f])
# Treat interrupted GPS paths as separate routes (useful if you occasionally stop running, walk for a bit, and start again like I do).
for (i in curr_route$tracks[[1]]) {
  c <- c + 1
  location <- i
  file <- c(file,rep(files[f], dim(location)[1])) 
  index <- c(index, rep(c, dim(location)[1]))
  latitude <- c(latitude, location$lat)
  longitude <- c(longitude, location$lon)
}
}
routes <- data.frame(cbind(index, latitude, longitude,file))

# Because the routes dataframe takes a while to generate for some folks - save it!
save(routes, file="routes.Rdata")
# Use to load as needed.
load("routes.Rdata")

# Fix data types
routes$file <- as.character(routes$file)
routes$latitude <- as.numeric(levels(routes$latitude)[routes$latitude])
routes$longitude <- as.numeric(levels(routes$longitude)[routes$longitude])
routes <- transform(routes, index = as.numeric(index))

# Load Meta Data
meta_data <- read.csv("cardioActivities.csv", stringsAsFactors=FALSE)
meta_data <- plyr::rename(meta_data, c("GPX.File" = "file"))

# Bind routes (%>% replaces dplyr's deprecated %.% operator)
routes <- left_join(routes, meta_data, by="file") %>%
  arrange(index)


# Use this function to specify the activity color if you have multiple activities.
activity_color <- function(activity) {
  if (activity=="Cycling") {
    color = "#00000060"
  } else if (activity=="Hiking") {
    color = "#00000060"
  } else {
    color = "#0080ff60"
  }
  color
}

# Identify clusters of points, which will correspond to locations you have run. For example,
# I have run in Boston, Iowa City, Chicago, and a few other cities. You will want to set the minimum krange
# to the number of cities you have run in (5 in my case).
clusters <- pamk(routes[,c("latitude", "longitude")], krange=num_locations:20, diss=T, usepam=F)$pamobject$medoids

# Plot Everything
for (r in 1:nrow(clusters)) {
  print(r)
  lat_range <- clusters[r,][1] + rnorm(20, sd=0.1)
  lon_range <-clusters[r,][2] + rnorm(20, sd=0.1)
  setroutes <- filter(routes, (latitude > min(lat_range) & latitude < max(lat_range)),
                      longitude > min(lon_range) &  longitude < max(lon_range))
  
  routeIds <- unique(setroutes$index)
  
  # Rectangular projection (via the mapproj package)
  locProj <- mapproject(setroutes$longitude, setroutes$latitude, "rectangular", par=38)
  setroutes$latproj <- locProj$x
  setroutes$lonproj <- locProj$y
  
  
  # Map the projected points
  pdf(sprintf("%s-all.pdf", r))
  
  plot(setroutes$latproj, setroutes$lonproj, type="n", asp=1, axes=FALSE, xlab="", ylab="")
  for (i in routeIds) {
    currRoute <- subset(setroutes, index==i)
    lines(currRoute$latproj, currRoute$lonproj, col=activity_color(currRoute$Type), lwd=0.4)
  }
  dev.off()
}

r Fetch graph data directly from the Neo4j REST API into R. Useful for R igraph users.

Fetch graph data directly from the Neo4j REST API into R. Useful for R igraph users.

neo4R_example.R
# Requirements
#sudo apt-get install libcurl4-gnutls-dev # for RCurl on linux
#install.packages('RCurl')
#install.packages('RJSONIO')

library('RCurl')
library('RJSONIO')
 
query <- function(querystring) {
  h = basicTextGatherer()
  curlPerform(url="http://localhost:7474/db/data/cypher",
    postfields=paste('query',curlEscape(querystring), sep='='),
    writefunction = h$update,
    verbose = FALSE
  )
  
  result <- fromJSON(h$value())
  
  data <- data.frame(t(sapply(result$data, unlist)))
  names(data) <- result$columns
  
  data 
}
 

# EXAMPLE
# =======

# Cypher Query:
q <- "match (o:Organization)-[r]-(p:Person) return o.name,o.location,p.account,p.name,p.email limit 20"

data <-query(q)

head(data,20)

# Output:
#           o.name                          o.location      p.account         p.name            p.email
# 1     PerfectLine                             Estonia         kritik                                  
# 2      Sappho OSS                           London UK    andrewheald                                  
# 3          The 88                                 NYC         aface1                                  
# 4          The 88                                 NYC     xbilldozer                                  
# 5          The 88                                 NYC         chadyj                                  
# 6          The 88                                 NYC       benmanns Benjamin Manns benmanns@gmail.net
# 7        simplabs                     Munich, Germany        marcoow                                  
# 8   Everyday Hero                 Brisbane, Australia    soloman1124                                  
# 9   Everyday Hero                 Brisbane, Australia         orodio                                  
# 10  Everyday Hero                 Brisbane, Australia justinhennessy                                  
# 11  Everyday Hero                 Brisbane, Australia           coop     Tim Cooper coop@latrobest.org
# 12  Everyday Hero                 Brisbane, Australia      evilmarty   Marty Zalega    marty@zalega.co
# 13 Sorenson Media  Salt Lake City, UT & San Diego, CA       bcarlson                                  
# 14 Sorenson Media  Salt Lake City, UT & San Diego, CA      elmomalmo                                  
# 15 Sorenson Media  Salt Lake City, UT & San Diego, CA        enthooz                                  
# 16         3scale Barcelona, Spain and Sunnyvale, USA          solso                                  
# 17         3scale Barcelona, Spain and Sunnyvale, USA   MarkCheshire                                  
# 18         3scale Barcelona, Spain and Sunnyvale, USA          rhoml                                  
# 19         3scale Barcelona, Spain and Sunnyvale, USA           mikz  Michal Cichra                   
# 20         3scale Barcelona, Spain and Sunnyvale, USA           njyx                                  
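
# Since the gist mentions igraph users, here is a minimal sketch (not part of the original output
# above) of turning an edge-list Cypher query into an igraph object; the Cypher pattern is an assumption.
library(igraph)
edges <- query("match (o:Organization)--(p:Person) return o.name, p.name limit 100")
g <- graph_from_data_frame(edges, directed=FALSE)
plot(g, vertex.size=3, vertex.label.cex=0.6)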

r wordcloud_test.r

wordcloud_test.r
## ref: http://blog.ouseful.info/2012/02/15/generating-twitter-wordclouds-in-r-prompted-by-an-open-learning-blogpost/
 
require(wordcloud)
 
## example data (placeholder words and frequencies -- substitute your own term counts)
words <- c("twitter", "wordcloud", "R", "open", "learning", "blogpost")
freqs <- c(50, 40, 30, 20, 15, 10)

## generate a png image of the wordcloud
png('test.png',width=600,height=600)
wordcloud(words, freqs, min.freq=1, random.order=FALSE)
dev.off()

r Plot Kaplan-Meier curves with ggplot. ggkmTable.R adds a table below the plot showing the numbers at risk at different times.

Plot Kaplan-Meier curves with ggplot. ggkmTable.R adds a table below the plot showing the numbers at risk at different times.

ggkmTable.R
#' Create a Kaplan-Meier plot using ggplot2
#'
#' @param sfit a \code{\link[survival]{survfit}} object
#' @param table logical: Create a table graphic below the K-M plot, indicating at-risk numbers?
#' @param returns logical: if \code{TRUE}, return an arrangeGrob object
#' @param xlabs x-axis label
#' @param ylabs y-axis label
#' @param ystratalabs The strata labels. \code{Default = levels(summary(sfit)$strata)}
#' @param ystrataname The legend name. Default = "Strata"
#' @param timeby numeric: control the granularity along the time-axis
#' @param main plot title
#' @param pval logical: add the p-value to the plot?
#' @return a ggplot is made. If returns=TRUE, an arrangeGrob object
#' is returned
#' @author Abhijit Dasgupta with contributions by Gil Tomas
#' \url{http://statbandit.wordpress.com/2011/03/08/an-enhanced-kaplan-meier-plot/}
#' @export
#' @examples
#' \dontrun{
#' data(colon)
#'  fit <- survfit(Surv(time,status)~rx, data=colon)
#'  ggkmTable(fit, timeby=500)
#' }
ggkmTable <- function(sfit, table=TRUE,returns = FALSE,
xlabs = "Time", ylabs = "survival probability",
ystratalabs = NULL, ystrataname = NULL,
timeby = 100, main = "Kaplan-Meier Plot",
pval = TRUE, ...) {
require(plyr)
require(ggplot2)
require(survival)
require(gridExtra)
require(grid) # for unit()
if(is.null(ystratalabs)) {
   ystratalabs <- as.character(levels(summary(sfit)$strata))
}
m <- max(nchar(ystratalabs))
if(is.null(ystrataname)) ystrataname <- "Strata"
times <- seq(0, max(sfit$time), by = timeby)
.df <- data.frame(time = sfit$time, n.risk = sfit$n.risk,
    n.event = sfit$n.event, surv = sfit$surv, strata = summary(sfit, censored = T)$strata,
    upper = sfit$upper, lower = sfit$lower)
levels(.df$strata) <- ystratalabs
zeros <- data.frame(time = 0, surv = 1, strata = factor(ystratalabs, levels=levels(.df$strata)),
    upper = 1, lower = 1)
.df <- rbind.fill(zeros, .df)
d <- length(levels(.df$strata))
p <- ggplot(.df, aes(time, surv, group = strata)) +
    geom_step(aes(linetype = strata), size = 0.7) +
    theme_bw() +
    theme(axis.title.x = element_text(vjust = 0.5)) +
    scale_x_continuous(xlabs, breaks = times, limits = c(0, max(sfit$time))) +
    scale_y_continuous(ylabs, limits = c(0, 1)) +
    theme(panel.grid.minor = element_blank()) +
    theme(legend.position = c(ifelse(m < 10, .28, .35), ifelse(d < 4, .25, .35))) +
    theme(legend.key = element_rect(colour = NA)) +
    labs(linetype = ystrataname) +
    theme(plot.margin = unit(c(0, 1, .5, ifelse(m < 10, 1.5, 2.5)), "lines")) +
    ggtitle(main)
 
if(pval) {
    sdiff <- survdiff(eval(sfit$call$formula), data = eval(sfit$call$data))
    pval <- pchisq(sdiff$chisq, length(sdiff$n)-1, lower.tail = FALSE)
    pvaltxt <- ifelse(pval < 0.0001, "p < 0.0001", paste("p =", signif(pval, 3)))
    p <- p + annotate("text", x = 0.6 * max(sfit$time), y = 0.1, label = pvaltxt)
}

## Create a blank plot for place-holding
## .df <- data.frame()
blank.pic <- ggplot(.df, aes(time, surv)) +
    geom_blank() +
    theme_bw() +
    theme(axis.text.x = element_blank(), axis.text.y = element_blank(),
        axis.title.x = element_blank(), axis.title.y = element_blank(),
        axis.ticks = element_blank(), panel.grid.major = element_blank(),
        panel.border = element_blank())
if(table) {
    ## Create table graphic to include at-risk numbers
    risk.data <- data.frame(strata = summary(sfit, times = times, extend = TRUE)$strata,
        time = summary(sfit, times = times, extend = TRUE)$time,
        n.risk = summary(sfit, times = times, extend = TRUE)$n.risk)
    data.table <- ggplot(risk.data, aes(x = time, y = strata, label = format(n.risk, nsmall = 0))) +
        #, color = strata)) +
        geom_text(size = 3.5) +
        theme_bw() +
        scale_y_discrete(breaks = as.character(levels(risk.data$strata)), labels = ystratalabs) +
        # scale_y_discrete(#format1ter = abbreviate,
        # breaks = 1:3,
        # labels = ystratalabs) +
        scale_x_continuous("Numbers at risk", limits = c(0, max(sfit$time))) +
        theme(axis.title.x = element_text(size = 10, vjust = 1), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), panel.border = element_blank(),
        axis.text.x = element_blank(), axis.ticks = element_blank(),
        axis.text.y = element_text(face = "bold", hjust = 1))
    data.table <- data.table + theme(legend.position = "none") +
        xlab(NULL) + ylab(NULL)
    data.table <- data.table +
        theme(plot.margin = unit(c(-1.5, 1, 0.1, ifelse(m < 10, 2.5, 3.5)-0.28 * m), "lines"))
## Plotting the graphs
## p <- ggplotGrob(p)
## p <- addGrob(p, textGrob(x = unit(.8, "npc"), y = unit(.25, "npc"), label = pvaltxt,
    ## gp = gpar(fontsize = 12)))
    grid.arrange(p, blank.pic, data.table,
        clip = FALSE, nrow = 3, ncol = 1,
        heights = unit(c(2, .1, .25),c("null", "null", "null")))
    if(returns) {
        a <- arrangeGrob(p, blank.pic, data.table, clip = FALSE,
            nrow = 3, ncol = 1, heights = unit(c(2, .1, .25),c("null", "null", "null")))
        return(a)
    }
}
else {
    ## p <- ggplotGrob(p)
    ## p <- addGrob(p, textGrob(x = unit(0.5, "npc"), y = unit(0.23, "npc"),
    ## label = pvaltxt, gp = gpar(fontsize = 12)))
    print(p)
    if(returns) return(p)
    }
}
ggkm.R
#' Create a Kaplan-Meier plot using ggplot2
#'
#' @param sfit a \code{\link[survival]{survfit}} object
#' @param returns logical: if \code{TRUE}, return a ggplot object
#' @param xlabs x-axis label
#' @param ylabs y-axis label
#' @param ystratalabs The strata labels. \code{Default = levels(summary(sfit)$strata)}
#' @param ystrataname The legend name. Default = "Strata"
#' @param timeby numeric: control the granularity along the time-axis
#' @param main plot title
#' @param pval logical: add the p-value to the plot?
#' @return a ggplot is made. If returns=TRUE, a ggplot object
#' is returned
#' @author Abhijit Dasgupta with contributions by Gil Tomas
#' \url{http://statbandit.wordpress.com/2011/03/08/an-enhanced-kaplan-meier-plot/}
#' @export
#' @examples
#' \dontrun{
#' data(colon)
#'  fit <- survfit(Surv(time,status)~rx, data=colon)
#'  ggkm(fit, timeby=500)
#' }
ggkm <- function(sfit, returns = FALSE,
xlabs = "Time", ylabs = "survival probability",
ystratalabs = NULL, ystrataname = NULL,
timeby = 100, main = "Kaplan-Meier Plot",
pval = TRUE, ...) {
require(plyr)
require(ggplot2)
require(survival)
require(gridExtra)
require(grid) # for unit()
if(is.null(ystratalabs)) {
   ystratalabs <- as.character(levels(summary(sfit)$strata))
}
m <- max(nchar(ystratalabs))
if(is.null(ystrataname)) ystrataname <- "Strata"
times <- seq(0, max(sfit$time), by = timeby)
.df <- data.frame(time = sfit$time, n.risk = sfit$n.risk,
    n.event = sfit$n.event, surv = sfit$surv, strata = summary(sfit, censored = T)$strata,
    upper = sfit$upper, lower = sfit$lower)
levels(.df$strata) <- ystratalabs
zeros <- data.frame(time = 0, surv = 1, strata = factor(ystratalabs, levels=levels(.df$strata)),
    upper = 1, lower = 1)
.df <- rbind.fill(zeros, .df)
d <- length(levels(.df$strata))
p <- ggplot(.df, aes(time, surv, group = strata)) +
    geom_step(aes(linetype = strata), size = 0.7) +
    theme_bw() +
    theme(axis.title.x = element_text(vjust = 0.5)) +
    scale_x_continuous(xlabs, breaks = times, limits = c(0, max(sfit$time))) +
    scale_y_continuous(ylabs, limits = c(0, 1)) +
    theme(panel.grid.minor = element_blank()) +
    theme(legend.position = c(ifelse(m < 10, .28, .35), ifelse(d < 4, .25, .35))) +
    theme(legend.key = element_rect(colour = NA)) +
    labs(linetype = ystrataname) +
    theme(plot.margin = unit(c(0, 1, .5, ifelse(m < 10, 1.5, 2.5)), "lines")) +
    ggtitle(main)
 
if(pval) {
    sdiff <- survdiff(eval(sfit$call$formula), data = eval(sfit$call$data))
    pval <- pchisq(sdiff$chisq, length(sdiff$n)-1, lower.tail = FALSE)
    pvaltxt <- ifelse(pval < 0.0001, "p < 0.0001", paste("p =", signif(pval, 3)))
    p <- p + annotate("text", x = 0.6 * max(sfit$time), y = 0.1, label = pvaltxt)
}
## Plotting the graphs
    print(p)
    if(returns) return(p)
   
}
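
# Example usage for both functions (follows the roxygen examples above; uses the survival
# package's colon dataset):
library(survival)
fit <- survfit(Surv(time, status) ~ rx, data = colon)
ggkm(fit, timeby = 500)        # K-M curves only
ggkmTable(fit, timeby = 500)   # K-M curves plus the at-risk table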

r College Board search file column headers

College Board search file column headers

cboard-hearders.r
c("order_no", "run_no", "student_id", "last_name", "first_name", 
"mi", "street1", "street2", "street3", "city", "state", "zip", 
"country", "county_code", "post_del", "post_corr", "email", "dob", 
"gender", "ethnicity", "grad_year", "hs_code", "geomarket", "tbd", 
"major1", "major2", "major3", "major4", "major5", "ap1", "ap2", 
"ap3", "ap4", "ap5", "ap6", "ap7", "ap8", "ap9", "ap10", "satsub1", 
"satsub2", "satsub3", "satsub4", "satsub5", "satsub6", "satsub7", 
"satsub8", "satsub9", "satsub10", "name_source", "update_date", 
"homeschool", "low_ses", "hs_cluster", "en_cluster", "TBD_1", 
"TBD_2", "TBD_3", "TBD_4", "TBD_5", "TBD_6", "TBD_7", "TBD_8", 
"TBD_9", "TBD_10", "TBD_11", "TBD_12", "TBD_13", "TBD_14", "TBD_15", 
"TBD_16", "TBD_17", "TBD_18", "TBD_19")

r Team goal distributions for the 2013/14 season

Team goal distributions for the 2013/14 season

nhl-goal-distribution.r
## load the packages
library(XML)
library(RCurl)
library(plyr)
library(ggplot2)
library(reshape2)
library(stringr)

## the page
URL = "http://www.hockey-reference.com/leagues/NHL_2014_standings.html"

## read the raw page
team_page = getURL(URL)
team_page = htmlParse(team_page)

## parse out the links
links = xpathSApply(team_page, '//a/@href')
names(links) = NULL
team_links = links[str_detect(links, ".*/2014.html")]
team_links = unique(team_links)

## for each link, grab the page, grab the skater stats, save to master dataset
skaters = data.frame(stringsAsFactors=F)
for (TEAM in team_links) {
  # build the page
  URL = paste0("http://www.hockey-reference.com", TEAM)
  # grab the page
  tm_page = readHTMLTable(URL, stringsAsFactors=F)
  # grab the skater stats
  tmp_skaters = tm_page$skaters
  # fix a couple of the names
  names(tmp_skaters)[10] = "plusminus"
  names(tmp_skaters)[20] = "shotpct"
  # add the team
  team = str_extract(TEAM, "[A-Z]{3}")
  tmp_skaters$team = team
  # append the data to the skaters data frame
  skaters = rbind.fill(skaters, tmp_skaters)
  # status
  cat("finished ", TEAM, "\n")
}

## how many skaters does each team have?
ddply(skaters, .(team), summarise, num_players = length(Player))
ggplot(skaters, aes(factor(team))) + 
  geom_bar() + 
  theme_bw() + 
  labs(title="Skaters per team", x="Team", y="Number of skaters")


## classify the goal scorer types
skaters$G = as.numeric(skaters$G)
summary(skaters$G)
skaters$skater_type = cut(skaters$G, 
                          breaks=c(0, 5, 10, 20, 30, 40, 50, 60),
                          include.lowest = T,
                          right = F)


## team distributions
tbl_dist = with(skaters, table(team, skater_type))
round(prop.table(tbl_dist, 1), 2) ## row distributions

## metric =  split on 20 + (or a better number)
## scatterplot (x = percentage, y = volume)
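
## Sketch of that metric (an assumption of what was intended): per team, the share of skaters
## with 20+ goals versus the team's total goals. Note this ignores the multi-stint issue flagged below.
team_metric = ddply(skaters, .(team), summarise,
                    pct_20plus = mean(G >= 20, na.rm=TRUE),
                    total_goals = sum(G, na.rm=TRUE))
ggplot(team_metric, aes(x=pct_20plus, y=total_goals, label=team)) +
  geom_point() +
  geom_text(vjust=-0.5, size=3) +
  theme_bw() +
  labs(x="Share of skaters with 20+ goals", y="Team goals")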


## BELOW HERE IS TEMP

## grab the data
## http://goo.gl/0ZurK
# tables = readHTMLTable(URL, stringsAsFactors=F)
# length(tables)
# names(tables)
# head(tables$standings)
## XML package FTW!

## bring the data into a dataframe
nhl_14 = tables$stats
colnames(nhl_14) = tolower(colnames(nhl_14))

## change the rank column and remove the row breaks
nhl_14$rk = as.numeric(nhl_14$rk)
nhl_14 = subset(nhl_14, !is.na(rk))
nhl_14 = subset(nhl_14, tm != 'Tm')

## STOP: Vanek is rolled up. Need to crawl team by team and summarize that way.

## lets look at the goals column to get a sense of the data
nhl_14$g = as.numeric(nhl_14$g)
summary(nhl_14$g)

## cut the goals variable into groups
nhl_14$break5 = cut(nhl_14$g,
                    breaks = seq(0, 60, 5), 
                    include.lowest=T, 
                    right=F)
nhl_14$break10 = cut(nhl_14$g,
                     breaks = seq(0, 60, 10), 
                     include.lowest=T, 
                     right=F)
nhl_14$core_breaks = cut(nhl_14$g,
                         breaks = c(0, 10, 20, 25, 30, 40, 50, 60), 
                         include.lowest=T, 
                         right=T)


## Need to isolate players if they have multiple stints;
## make a single record by selecting the max stint number, but summarize per player.

## quick distribution of the types
ggplot(nhl_14, aes(core_breaks)) + geom_bar()


## summarize by team
## by_team <- ddply(nhl_14, .(tm, core_breaks), )  # TODO: summarise call not written yet