r 一个沟道agilent.R

scrub.sh
rm -f data/scrubbed/*.txt
# the agilent files have extra headers. we just
# want the part of the file that starts with "FEATURES"
# remove the extra headers and split files
# into sets where the name reflects the array used.
for f in data/{LT,NJ,mi}*.txt; do 
    rm -f a.tmp
    awk 'BEGIN { seen=0;FS=OFS="\t" }
        { if($1=="FEATURES"){ seen=1 }
        if(seen==1){ print $0; }}' $f > a.tmp
    nlines=$(wc -l "a.tmp" | awk '{ print $1 }')
    n=$(($nlines - 1))
    mv a.tmp data/scrubbed/`basename $f .txt`.${n}.txt
    echo $f $n
done


# for each chip used, create a target file.
for N in 45015 62976; do
    T=data/scrubbed/${N}.targets.txt
    echo $'FileName\tCondition' > $T
    for f in data/scrubbed/*${N}.txt; do

        grp=$(echo $f | perl -pe 's/^.+_(\w+)\.\d+.*/$1/;s/IDL/ILD/;s/.+CTRL/CTRL/;')
        echo "$f    $grp" >> $T
    done
done
one-channel-agilent.R
library(limma)

GROUP="62976"
# targets.txt has columns of "FileName" and "Condition" e.g.
"""
FileName    Condition
data/scrubbed/LT001098RU_COPD.45015.txt COPD
data/scrubbed/LT001600RL_ILD.45015.txt  ILD
data/scrubbed/LT003990RU_CTRL.45015.txt CTRL
data/scrubbed/LT004173LL_ILD.45015.txt  ILD
data/scrubbed/LT007392RU_COPD.45015.txt COPD
"""
targets = readTargets(paste("data/scrubbed/", GROUP, ".targets.txt", sep=""))
dat = read.maimages(files=targets$FileName, path=".", source="agilent.median", green.only=T,
    columns=list(G="gMedianSignal", Gb="gBGMedianSignal"), 
    annotation=c("Row", "Col", "ProbeName", "SystematicName")
)

# https://stat.ethz.ch/pipermail/bioconductor/attachments/20101217/1c82279c/attachment.pl
# https://stat.ethz.ch/pipermail/bioconductor/2010-December/037051.html
# http://matticklab.com/index.php?title=Single_channel_analysis_of_Agilent_microarray_data_with_Limma

# http://tolstoy.newcastle.edu.au/R/e15/help/11/08/3339.html

# E contains the values for single-channel analyses
dat <- backgroundCorrect(dat, method="normexp", offset=1)
dat$E <- normalizeBetweenArrays(dat$E, method="quantile")
dat$E <- log2(dat$E)

E = new("MAList", list(targets=dat$targets, genes=dat$genes, source=dat$source, M=dat$E, A=dat$E))
E.avg <- avereps(E, ID=E$genes$ProbeName)


# make the design matrix
d.levels = unique(targets$Condition)
d.factor = factor(targets$Condition, levels=d.levels)
d.design = model.matrix(~0 + d.factor)
colnames(d.design) = levels(d.factor)

fit = lmFit(E.avg$A, d.design)

# customize comparisons here.
contrast.matrix <- makeContrasts("COPD-ILD", "COPD-CTRL", "ILD-CTRL", levels=d.design)

fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)


for(name in colnames(contrast.matrix)){
    output = topTable(fit2, adjust="BH", coef=name, genelist=E.avg$genes, number=90000)
    write.table(output, file=paste(GROUP, "-", name, ".txt", sep=""), sep="\t", quote=FALSE, row.names=F)
}

r gistfile1.r

gistfile1.r

## ----, include=FALSE-----------------------------------------------------
library(knitr)


## ------------------------------------------------------------------------
source("pre_azw.R")


## ------------------------------------------------------------------------
languages = read.csv("languages.csv", header = FALSE, stringsAsFactors = FALSE)[[1]]


## ------------------------------------------------------------------------
lang_regex <- paste(languages, collapse="|") %>% paste("(", ., ")", sep="")
lang_level_regex <- paste("(Elementary|Intermediate|Advanced)\\s+", lang_regex, sep="")
lang_in_title <- grepl(lang_regex, courses$course_title)
lang_level_in_title <- with(courses, grepl(lang_level_regex, course_title) | grepl(lang_level_regex, sub_title))
has_lang_code <- grepl("^\\w+ [A-C][a-z]*\\.", courses$course_title, perl=TRUE)
lang_in_desc <- grepl("language course", courses$course_description, ignore.case = TRUE)


## ------------------------------------------------------------------------
matched <- courses %>% filter(lang_in_desc | has_lang_code | lang_level_in_title)
unmatched <- courses %>% filter(lang_in_title) %>% anti_join(matched)



r 用于UNITID,CEEB代码,OPEID等的PESC Crosswalk数据库。该数据库很旧,但可以在http://www.pesc.org/interior.php?p找到

用于UNITID,CEEB代码,OPEID等的PESC Crosswalk数据库。该数据库很旧,但可以在http://www.pesc.org/interior.php?page_id=145找到

pesc-crosswalk.R
## connect to the dbf file included with the Crosswalker tool
## http://www.pesc.org/interior.php?page_id=145
DIR = "C:/Program Files/t.h.ink Software/Crosswalker"
FILE = "CODEMAST.DBF"
library(foreign)
db = read.dbf(file.path(DIR, FILE), as.is = T)
colnames(db)
subset(db, IPEDSUNIT==164739)

r 导入荷马ChIP-Seq Motif数据

导入荷马ChIP-Seq Motif数据

import_homer_ChIP_Seq_motif_data.R
# Daniel Cook 2014
# Danielecook.com
#
# Use this function to import ChIP Seq Data generated by Homer. This data is generated using homers findMotifsGenome.pl
# command with the '-find <motif file>' argument. Generate output
# looks like this:
#
# 1. Peak/Region ID
# 2. Chromosome
# 3. Start
# 4. End
# 5. Strand of Peaks
# 6-18: annotation information
# 19. CpG%
# 20. GC%
# 21. Motif Instances <distance from center of region>(<sequence>,<strand>,<conservation>)
#
# Getting the information from column 21 for plotting, for example, frequency can be tricky. This funciton aims to help that
# by converting the data format from wide to long. Currently, annotation columns are ignored but you can modify the function to
# fix.

library(stringr)
library(splitstackshape)
library(reshape2)
library(dplyr)
suppressMessages(library(data.table))

import_peak_file <- function(filename, peak_type) {
  df <- fread(filename)
  setnames(df, names(df)[1],"Peak")
  setnames(df, names(df)[9:length(names(df))] , str_extract(names(df)[9:length(names(df))], "[A-Za-z0-9]+"))
  motifs <- names(df)[9:length(names(df))]
  # Reshape latter columns

  # Remove extra columns if present
  df$Annotation <- NULL
  df$"Focus Ratio/Region Size" <- NULL
  df$Detailed <- NULL
  df$Distance <- NULL
  df$Nearest <- NULL
  df$Entrez <- NULL
  df$Nearest <- NULL
  df$Nearest <- NULL
  df$Nearest <- NULL
  df$Gene <- NULL
  df$Gene <- NULL
  df$Gene <- NULL
  df$Gene <- NULL
  
  # Melt Again
  df <- melt(df, id.vars = 1:8)
  df$value <- str_replace_all(df$value, "),", "|")
  df <- filter(df, value != "")
  df <- concat.split(as.data.frame(df), split.col = c("value"), sep="|")
  df$value <- NULL
  
  df <- rename(df, c("variable"="motif_name"))
  df <- melt(df, id.vars = 1:9)
  df$value_1 <- NULL
  df$variable <- NULL
  df <- filter(df, value != "")
  
  df$MOTIF <- str_extract(df$value,"[ATCG]+")
  df$POS <- as.integer(str_extract(df$value,"[-0-9]+"))
  df$STRAND <- str_extract(df$value,"(,[+|-],)")
  df$STRAND <- str_replace_all(df$STRAND,",","")
  df$value <- NULL
  df <- filter(df, !is.na(POS))
  df$peak_type <- peak_type
  df
}

r 按需安装R包(如果需要)

按需安装R包(如果需要)

gistfile1.r
if (!require("data.table")) {
install.packages("data.table")
}
if (!require("reshape2")) {
install.packages("reshape2")
}
require("data.table")
require("reshape2")

r pca example.R

pca example.R
set.seed(2)
x <- 1:100

y <- 20 + 3 * x
e <- rnorm(100, 0, 60)
y <- 20 + 3 * x + e

plot(x,y)
yx.lm <- lm(y ~ x)
lines(x, predict(yx.lm), col="red")

xy.lm <- lm(x ~ y)
lines(predict(xy.lm), y, col="blue")

# so lm() depends on which variable is x and wich is y
# lm minimizes y distance (the error term is y-yhat)

#normalize means and cbind together
xyNorm <- cbind(x=x-mean(x), y=y-mean(y))
plot(xyNorm)

#covariance
xyCov <- cov(xyNorm)
eigenValues <- eigen(xyCov)$values
eigenVectors <- eigen(xyCov)$vectors
eigenValues
eigenVectors

plot(xyNorm, ylim=c(-200,200), xlim=c(-200,200))
lines(xyNorm[x], eigenVectors[2,1]/eigenVectors[1,1] * xyNorm[x])
lines(xyNorm[x], eigenVectors[2,2]/eigenVectors[1,2] * xyNorm[x])

# the largest eigenValue is the first one
# so that's our principal component.
# but the principal component is in normalized terms (mean=0)
# and we want it back in real terms like our starting data
# so let's denormalize it
plot(x,y)
lines(x, (eigenVectors[2,1]/eigenVectors[1,1] * xyNorm[x]) + mean(y))
# that looks right. line through the middle as expected

# what if we bring back our other two regressions?
lines(x, predict(yx.lm), col="red")
lines(predict(xy.lm), y, col="blue")

r pca example.R

pca example.R
set.seed(2)
x <- 1:100

y <- 20 + 3 * x
e <- rnorm(100, 0, 60)
y <- 20 + 3 * x + e

plot(x,y)
yx.lm <- lm(y ~ x)
lines(x, predict(yx.lm), col="red")

xy.lm <- lm(x ~ y)
lines(predict(xy.lm), y, col="blue")

# so lm() depends on which variable is x and wich is y
# lm minimizes y distance (the error term is y-yhat)

#normalize means and cbind together
xyNorm <- cbind(x=x-mean(x), y=y-mean(y))
plot(xyNorm)

#covariance
xyCov <- cov(xyNorm)
eigenValues <- eigen(xyCov)$values
eigenVectors <- eigen(xyCov)$vectors
eigenValues
eigenVectors

plot(xyNorm, ylim=c(-200,200), xlim=c(-200,200))
lines(xyNorm[x], eigenVectors[2,1]/eigenVectors[1,1] * xyNorm[x])
lines(xyNorm[x], eigenVectors[2,2]/eigenVectors[1,2] * xyNorm[x])

# the largest eigenValue is the first one
# so that's our principal component.
# but the principal component is in normalized terms (mean=0)
# and we want it back in real terms like our starting data
# so let's denormalize it
plot(x,y)
lines(x, (eigenVectors[2,1]/eigenVectors[1,1] * xyNorm[x]) + mean(y))
# that looks right. line through the middle as expected

# what if we bring back our other two regressions?
lines(x, predict(yx.lm), col="red")
lines(predict(xy.lm), y, col="blue")

r 我的.Rprofile

我的.Rprofile

rprofile.R
# Place in /Users/Username/.Rprofile
"
sys.source('~/Dropbox/appdata/Rprofile.r')
"


## Create a new invisible environment for all the functions to go in so it doesn't clutter your workspace.
.env <- new.env()


# Annoying syntax
install <- function(l) {
  install.packages(l)
}

library(devtools)
library(ggplot2)
library(plyr)
library(dplyr)
library(stringr)
library(knitr)
library(reshape2)
library(BiocInstaller)


# Load BiocLite
source("http://bioconductor.org/biocLite.R")

install <- function(t) {
  biocLite(as.character(substitute(t)))
}


#-----------------------------------------------------------------------
#                             Functions
#-----------------------------------------------------------------------

## Single character shortcuts for summary() and head().
.env$s <- base::summary
.env$h <- utils::head

## Strip row names from a data frame (stolen from plyr)
.env$unrowname <- function(x) {
    rownames(x) <- NULL
    x
}


## List all functions in a package (also from @_inundata)
.env$lsp <- function(package, all.names = FALSE, pattern) {
    package <- deparse(substitute(package))
    ls(
        pos = paste("package", package, sep = ":"),
        all.names = all.names,
        pattern = pattern
    )
}

## Open Finder to the current directory on mac
.env$macopen <- function(...) if(Sys.info()[1]=="Darwin") system("open .")
.env$o       <- function(...) if(Sys.info()[1]=="Darwin") system("open .")

## Transpose a numeric data frame with ID in first column
.env$tdf <- function(d) {
    row.names(d) <- d[[1]]
	d[[1]] <- NULL
	d <- as.data.frame(t(d))
    d$id <- row.names(d)
    d <- cbind(d[ncol(d)], d[-ncol(d)])
    row.names(d) <- NULL
    d
}

z.test = function(a, mu, var){
  zeta = (mean(a) - mu) / (sqrt(var / length(a)))
  return(zeta)
}
 
## Convert selected columns of a data frame to factor variables
.env$factorcols <- function(d, ...) lapply(d, function(x) factor(x, ...)) 
 
## Returns a logical vector TRUE for elements of X not in Y
"%nin%" <- function(x, y) !(x %in% y) 

# Order variables in a data frame.

corder <- function(df,...) {
  cols <-as.vector(eval(substitute((alist(...)))),mode="character")
  stopifnot(is.data.frame(df))
  df[,c(cols,unlist(setdiff(names(df),cols)))]
}

# Drop Variables from a data frame.
# Usage:
# df <- cdrop(df, <list of vars to drop>)

.env$cdrop <- function(df, ...) {
  cols <-as.vector(eval(substitute((alist(...)))),mode="character")
  stopifnot(is.data.frame(df))
  df[,c(unlist(setdiff(names(df),cols)))]
}

# Preview a data frame in Excel [Only works on Mac]
.env$excel <- function(df) {
  f <- paste0(tempdir(),'/', make.names(deparse(substitute(df))),'.',paste0(sample(letters)[1:5],collapse=""), '.csv')
  write.csv(df,f)
  system(sprintf("open -a 'Microsoft Excel' %s",f))
}

# Sample a random set of rows
.env$randomRows = function(df,n){
  return(df[sample(nrow(df),n),])
}


.env$pp <- function(header = TRUE, ...) {
  # Taken from the psych package; Thank you William Revelle!
  # Type pp() to fetch data from the clipboard.
    MAC <- Sys.info()[1] == "Darwin"
    if (!MAC) {
        if (header) 
            return(read.table(file("clipboard"), header = TRUE, 
                ...))
        else return(read.table(file("clipboard"), ...))
    }
    else {
        if (header) {
            return(read.table(pipe("pbpaste"), header = TRUE, 
                ...))
        }
        else {
            return(read.table(pipe("pbpaste"), ...))
        }
    }
}



attach(.env)
message("\n******************************\nSuccessfully loaded Rprofile.r\n******************************")

r 来自荷马的情节高峰

来自荷马的情节高峰

plot_peaks.R
library(data.table)
library(stringr)
library(splitstackshape)

args <- commandArgs(trailing=T)
if (length(args) == 0) {
  setwd("/Users/dancook/Dropbox/Andersen lab/LabFolders/Dan/ForOthers/Maneeshi")
}

df <- fread("Snail2_Twist_MergedPeaksPS1005kbSummit300bp_Sn_TwEBox.txt")
names(df)[1] <- "Peak"
names(df)[9] <- "Twist1"
names(df)[10] <- "Twist2"
names(df)[11] <- "Snail2"

# Format Columns
for (i in c("Twist1", "Twist2", "Snail2")) {
peak_name <- paste0(i,"_Peak")
strand_name <- paste0(i,"_Strand")
df[[peak_name]] <- sapply(str_split(df[[i]], ")"), function(x) {
  gsub(",NA","",paste0(as.numeric(str_extract(x,"[-0-9]+")), collapse = ","))
})
df[[strand_name]]  <- sapply(str_split(df[[i]], ")"), function(x) {
  gsub("NA|,NA","",paste(gsub(",","",str_extract(x,"(,[+|-],)")), sep=",", collapse=","))
})
}

longform <- function(motif) {
  peak <- paste0(motif,"_Peak")
  strand <- paste0(motif, "_Strand")
  S <- as.data.frame(cbind(Peak=df$Peak,Motif=motif, POS=df[[peak]], STRAND=df[[strand]]))
  S <- concat.split.multiple(S, c("POS", "STRAND"), direction="long")
  filter(S, !is.na(POS))
}

PEAKS <- rbind(longform("Twist1"), longform("Twist2"), longform("Snail2"))


hist(PEAKS$POS[PEAKS$Motif == "Twist1"])
hist(PEAKS$POS[PEAKS$Motif == "Twist2"])
hist(PEAKS$POS[PEAKS$Motif == "Snail2"])


# Plot densities
ggplot(PEAKS) +
  geom_density(aes(x=POS, color=Motif, alpha = 0.05)) +
  facet_wrap(~STRAND, ncol=1,nrow=2) +
  scale_x_continuous(breaks=seq(-200,300,10))

qplot(PEAKS$time)

# Plot densities 

r speed.r

speed.r
speed <- 55
distance <- 2

speeds <- seq(54, 0)

h <- function(x) ceiling((distance / (speed - x)) * 3600)
g <- Vectorize(h)

times = g(speeds)

plot(speeds,times, xlim=c(54, 0), xlab="speed (mph)", ylab="time (s)", col="red")
smoo<-spline(speeds,times)
lines(smoo$x,smoo$y)