r Monotonic increase

Check whether a vector is monotonically increasing (https://stackoverflow.com/questions/13093912/how-to-check-if-a-sequence-of-numbers-is-monotonically-increasing-or-decreasing)

monotonicIncrease.R
# TRUE when x is non-decreasing: every element equals the running maximum
all(x == cummax(x))
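
A quick check on throwaway vectors (values are illustrative); note the test accepts ties, so a strictly increasing check needs diff() instead:

x <- c(1, 2, 2, 5)
all(x == cummax(x))  # TRUE (non-decreasing, ties allowed)
y <- c(1, 3, 2)
all(y == cummax(y))  # FALSE
all(diff(x) > 0)     # FALSE: strict increase rejects the tie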

r Apply a function row-wise over multiple columns

rowWise.R
# mapply() walks the two columns in lockstep, so the anonymous function sees
# one row's values at a time; here it just prints them and returns a
# placeholder value of 1 for the new column.
dataTransactions[, monotoneIncreasing := mapply(
  function(transactionIdParam, stop.totalConsumptionParam) {
    print(paste0(transactionIdParam, ", ", stop.totalConsumptionParam))
    1
  },
  transactionId, stop.totalConsumption)
]
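
A self-contained sketch with a made-up dataTransactions table (the column names follow the snippet; the data and the row-wise rule are invented):

library(data.table)
dataTransactions <- data.table(transactionId = 1:3,
                               stop.totalConsumption = c(10.5, 20.0, 7.25))
dataTransactions[, bigConsumer := mapply(
  function(id, consumption) consumption > 10,  # any row-wise rule works here
  transactionId, stop.totalConsumption)
]
dataTransactions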

r Spread with multiple value columns in dplyr

Spread a long table to wide format across multiple value columns. Walkthrough from Kieran Healy: https://kieranhealy.org/blog/archives/2018/11/06/spreading-multiple-values/ (includes a multi_spread helper function)

spread_dplyr_datatable.r

library(tidyverse)  # tibble, dplyr, tidyr, purrr

## draw N random category labels from x
gen_cats <- function(x, N = 1000) {
    sample(x, N, replace = TRUE)
}

set.seed(101)
N <- 1000

income <- rnorm(N, 100, 50)

vars <- list(stratum = c(1:8),
          sex = c("M", "F"),
          race =  c("B", "W"),
          educ = c("HS", "BA"))

df <- as_tibble(map_dfc(vars, gen_cats))
df <- add_column(df, income)

## stratum, sex, race, educ, income
# data.table way: a single dcast() call handles multiple aggregate functions
data.table::setDT(df)
dt_wide <- data.table::dcast(df, sex + race + stratum ~ educ,
              fun.aggregate = list(mean, length),
              value.var = "income")

# dplyr
## Simple tidy summary
tv_wide1 <- df %>% group_by(sex, race, stratum, educ) %>%
    summarize(mean_inc = mean(income), N = n())
    
## 1. gather()
tv_wide2 <- df %>% group_by(sex, race, stratum, educ) %>%
    summarize(mean_inc = mean(income), N = n()) %>%
    gather(variable, value, -(sex:educ))

tv_wide2

## 2. unite()
tv_wide2 <- df %>% group_by(sex, race, stratum, educ) %>%
    summarize(mean_inc = mean(income), N = n()) %>%
    gather(variable, value, -(sex:educ)) %>%
    unite(temp, educ, variable)

tv_wide2

## 3. spread()
tv_wide2 <- df %>% group_by(sex, race, stratum, educ) %>%
    summarize(mean_inc = mean(income), N = n()) %>%
    gather(variable, value, -(sex:educ)) %>%
    unite(temp, educ, variable) %>%
    spread(temp, value)

tv_wide2


multi_spread <- function(df, key, value) {
    # quote the key column
    keyq <- rlang::enquo(key)
    # capture the value columns as quosures for gather()
    valueq <- rlang::enquo(value)
    s <- rlang::quos(!!valueq)
    df %>% gather(variable, value, !!!s) %>%
        unite(temp, !!keyq, variable) %>%
        spread(temp, value)
}

## Final version
tv_wide3 <- df %>% group_by(sex, race, stratum, educ) %>%
    summarize(mean_inc = mean(income), N = n()) %>%
    multi_spread(educ, c(mean_inc, N))

tv_wide3
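
Since tidyr 1.0, pivot_wider() handles multiple value columns directly, so the gather/unite/spread dance above is no longer necessary; a sketch of the equivalent call:

tv_wide4 <- df %>% group_by(sex, race, stratum, educ) %>%
    summarize(mean_inc = mean(income), N = n()) %>%
    tidyr::pivot_wider(names_from = educ, values_from = c(mean_inc, N))

tv_wide4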

r Read and combine files with purrr map

## Given a directory of CSV files (one per congress), pull them all in and tag each row with an id slug

readr_files.r
library(readr)
library(purrr)

filenames <- dir(path = "data/clean",
                 pattern = "*.csv",
                 full.names = TRUE)
data <- filenames %>% map_dfr(read_csv, .id = "congress")
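
With an unnamed vector, .id only records the element position ("1", "2", ...); to store the file slug instead, name the vector first. A sketch assuming the same directory layout:

data <- filenames %>%
  purrr::set_names(basename(filenames)) %>%
  map_dfr(read_csv, .id = "congress")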

r Load Lenz Affymetrix data

Load data from the Lenz study [reference], GEO accession GSE10846

LoadLenzAffy.R
catt <- define.catt(nspace=2)
catt("\n", A_scriptname, " started\n")
catt("-----------------------------------------------------\n")

inst.load.packages("GEOquery", inst.bioclite = TRUE)

get.lenz <- function(rdafile){
	# I use global variables in this function
	
	
	# get the Data from ncbi
	gse10846.dataset <- getGEO("GSE10846")
	# [HG-U133_Plus_2] Affymetrix Human Genome U133 Plus 2.0 Array
	# 48.1 MB
	eset <- gse10846.dataset[[1]]
	expr.lenz.probesets <- exprs(eset) 
	pheno.lenz <- pData(eset)
	
	
	# gene probe IDs to gene symbols
	inst.load.packages("hgu133plus2.db", inst.bioclite = TRUE)
	source.function("FK_getSYMBOL", A_FUNCD)
	expr.lenz <- FK_getSYMBOL(expr.lenz.probesets, "hgu133plus2")
	
	
	##### Now, work through the phenodata
	# 1. remove unnecessary
	colnames.to.remove <- c("title", "status", "submission_date", "last_update_date", "type"
		, "channel_count", "organism_ch1", "characteristics_ch1.2"
		, "characteristics_ch1.3", "characteristics_ch1.4", "characteristics_ch1.5"
		, "molecule_ch1", "extract_protocol_ch1", "extract_protocol_ch1.1", "label_ch1"
		, "label_protocol_ch1", "label_protocol_ch1.1", "taxid_ch1"
		, "hyb_protocol", "hyb_protocol.1", "scan_protocol", "scan_protocol.1", "description"  
		, "data_processing", "data_processing.1"
		, "platform_id", "contact_name", "contact_email", "contact_phone", "contact_fax"
		, "contact_laboratory", "contact_department", "contact_institute"
		, "contact_address", "contact_city", "contact_state", "contact_zip/postal_code"
		, "contact_country", "supplementary_file", "supplementary_file.1", "data_row_count")
	catt("\n\n  Data processing:", unique(as.character(pheno.lenz$data_processing)), "\n")
	catt("Data processing -", unique(as.character(pheno.lenz$data_processing.1)), "\n")
	catt("Platform ID:", unique(as.character(pheno.lenz$platform_id)), "\n")
	
	pheno.lenz <- pheno.lenz[, -which(colnames(pheno.lenz) %in% colnames.to.remove)]
	
	# removed unnecessary, now make the data nice
	pheno.lenz$geo_accession <- as.character(pheno.lenz$geo_accession)
	pheno.lenz$source_name_ch1 <- as.character(pheno.lenz$source_name_ch1)
	
	for(colN in 3:ncol(pheno.lenz)){
		tmp.pheno <- pheno.lenz[[colN]]
		tmp.pheno <- as.character(tmp.pheno)
		tmp.pheno <- sub(".*: ", "", tmp.pheno)
		pheno.lenz[[colN]] <- tmp.pheno
	}
	colnames(pheno.lenz) <- c("geo_accession", "sourcename", "sex", "age", "GCBABC"
							  , "OS_STAT_character", "OS" # I know from website; check NCBI: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE10846
							  ,"chemotherapy", "ECOGperf", "stage", "LDHratio", "nExtranodalSites")
	pheno.lenz$sourcename <- paste0("Source", pheno.lenz$sourcename)
	pheno.lenz$age <- as.numeric(pheno.lenz$age)
	pheno.lenz$GCBABC <- sub(" DLBCL", "", pheno.lenz$GCBABC)
	# survival event coding: see "event" at ?Surv
	tmp.surv <- rep(NA, nrow(pheno.lenz))
	tmp.surv[pheno.lenz$OS_STAT_character == "DEAD"] <- 1
	tmp.surv[pheno.lenz$OS_STAT_character == "ALIVE"] <- 0
	tmp.surv[! pheno.lenz$OS_STAT_character %in% c("DEAD", "ALIVE")] <- NA
	pheno.lenz$OS_STAT <- tmp.surv
	pheno.lenz$OS.years <- as.numeric(pheno.lenz$OS)
	pheno.lenz$chemotherapy <- sub("-Like Regimen", "", pheno.lenz$chemotherapy)
	pheno.lenz$ECOGperf <- as.numeric(pheno.lenz$ECOGperf)
	pheno.lenz$stage <- as.numeric(pheno.lenz$stage)
	pheno.lenz$LDHratio <- as.numeric(pheno.lenz$LDHratio)
	pheno.lenz$nExtranodalSites <- as.numeric(pheno.lenz$nExtranodalSites)
	catt("6 NA coercion warnings are expected here - they are fine!\n")
	
	save(file=rdafile, expr.lenz, pheno.lenz)
	catt(rdafile, " saved\n")
	
	title <- "Lenz_FirstImpressions.pdf"
	pdf(file.path(A_PDF.DIR, title))
	hist(expr.lenz, main = "All expr.lenz, MAS5")
	dev.off()
	catt(file.path(A_PDF.DIR, title), " saved\n")
}
currentCodechunk <- file.path(A_RDA.DIR, "Lenz.rda")
# build the .rda once, then just load it on every later run
if(!file.exists(currentCodechunk)){
	get.lenz(currentCodechunk)
}
load(currentCodechunk)


catt <- define.catt()
catt("-----------------------------------------------------\n")
catt(A_scriptname, " finished\n")
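
The compute-once idiom at the end (build the .rda only if it is missing, then load it) is handy on its own; a minimal generic sketch, with make_data() standing in for any expensive step:

make_data <- function() { Sys.sleep(1); rnorm(10) }  # hypothetical expensive step
cache <- file.path(tempdir(), "cache.rda")
if (!file.exists(cache)) {
  result <- make_data()
  save(result, file = cache)
}
load(cache)  # later runs restore `result` without recomputing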

r Parse the CapFriendly NHL salary website

parse_cap.r
options(stringsAsFactors = FALSE)

## load the packages
library(tidyverse)
library(rvest)

## settings
BASE_URL = "https://www.capfriendly.com/browse/active/2015&p=%s"
TABLE_NAME = "brwt"
TABLE_XPATH = '//*[@id="brwt"]'
PAGES = 1:29  ## there are 29 pages via the browser

## dataset to store our results
salaries = data.frame()

## loop and parse the data
for (i in PAGES) { 
  ## build the URL and get the page
  URL = sprintf(BASE_URL, i)
  resp = URL %>% read_html() %>% html_nodes(xpath=TABLE_XPATH)
  stats = resp[[1]] %>% html_table()
  ## make all strings to avoid type issues and bind the data
  stats2 = mutate_all(stats, as.character)
  salaries = bind_rows(salaries, stats2)
  ## cleanup
  rm(URL, resp, stats, stats2)
  ## status
  cat("finished ", i, "\n")
} #endfor


## save out the dataset
write_csv(salaries, "~/Downloads/salaries1415.csv", na="")
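
Scrapes like this occasionally fail partway through the loop; a hedged sketch of a retry wrapper (read_html_retry is an invented helper, not part of rvest) that could replace the bare read_html() call:

## invented helper: retry a fetch a few times, pausing between attempts
read_html_retry <- function(url, tries = 3, pause = 2) {
  for (k in seq_len(tries)) {
    out <- try(xml2::read_html(url), silent = TRUE)
    if (!inherits(out, "try-error")) return(out)
    Sys.sleep(pause)  # also keeps the request rate polite
  }
  stop("failed to fetch ", url, " after ", tries, " tries")
}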

r Lock files for better parallel synchronization

Use lock files on a shared directory to coordinate parallel workers

lockProcess.R

padlock <- function(dir,fun,id) {
  
  if( missing(id) ) { id <- NULL } 
  if( missing(dir) ) { dir <- NULL } 
  if( missing(fun) ) { stop("fun must be one of: Lock, Unlock, isLocked, uniqueLocker, countLockers, Age") }
  
  options(warn=-1)
  on.exit(options(warn=0))  # restore warnings even on early return()
  
  file.t <- paste0("Padlocker#",id,".Lk")
  
  ## ---------------------------
  
  if( fun == "Lock" ) {
    
    write("Locked",file=paste0(dir,"/",file.t),append=FALSE)
    
  }
  
  ## ---------------------------
  
  if( fun == "Unlock" ) {
    
    file.remove( paste0(dir,"/",file.t) )

    if( id == -1 ) {
      
      file.remove( list.files(dir,"Padlocker#",full.names = TRUE) )
      
    }
  }
  
  ## ---------------------------
  
  if( fun == "isLocked" ) {
    
    fs <- list.files(dir,"Padlocker#",full.names = TRUE)
    return( length(fs) != 0 )
    
  }
  
  ## ---------------------------
  
  if( fun == "uniqueLocker" ) {
    
    fs <- list.files(dir,"Padlocker#",full.names = TRUE)
    # TRUE only when exactly one lock file exists and it is ours
    # (fixed = TRUE so "#" and "." in the file name match literally)
    return( length(fs) == 1 && grepl(file.t, fs, fixed = TRUE) )
    
  }
  
  ## ---------------------------
  
  if( fun == "countLockers" ) {
    
    fs <- list.files(dir,"Padlocker#",full.names = TRUE)
    return( length(fs)  )
    
  }
  
  ## ---------------------------
  
  if( fun == "Age" ) {
    
    fs <- list.files(dir,"Padlocker#",full.names = TRUE)
    age.i <- numeric(0)
    
    if( length(fs) > 0 ) {
      
        for( i in 1:length(fs)) {
          
          age <- as.numeric( difftime(Sys.time(), file.info(fs[i] )$atime, units ="mins") )
          age.i <- c(age.i,age)
          
        }
    }
    if( length(fs) == 0 ) {
      age.i <- 0
    }

    return(min(age.i))
    
  }
  
  ## ---------------------------
  
}
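
A minimal usage sketch, assuming each parallel worker passes a unique id (here the process id) and dir is a directory every worker can see:

lock.dir <- tempdir()
id <- Sys.getpid()

padlock(lock.dir, "Lock", id)                 # announce this worker
if( padlock(lock.dir, "uniqueLocker", id) ) {
  # ... critical work only one process may perform ...
}
padlock(lock.dir, "countLockers", id)         # how many lock files exist
padlock(lock.dir, "Unlock", id)               # release; id = -1 clears all locks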

r Track processing time in R loops and functions

Track processing time across the steps of R loops and functions

tickTack.R
## --------------------------------------------------
## --------------------------------------------------
##
## Track process time in R Loops and Functions
## Muffins 'n' Code
## https://github.com/jorgeassis
##
## --------------------------------------------------
## --------------------------------------------------

tickTack <- function(func,process) { 
  
  # "init" starts a global clock, "tick" logs a named step,
  # "tack" logs a final step and prints the whole timing table
  if( func == "init") { clock <<- data.frame(Process="Start",Time=Sys.time(), Difference=0) }  
  if( func == "tick") { clock <<- rbind(clock,data.frame(Process=process,Time=Sys.time(), Difference=Sys.time() - clock$Time[length(clock$Time)] )) }
  if( func == "tack") { clock <<- rbind(clock,data.frame(Process="Print",Time=Sys.time(), Difference=Sys.time() - clock$Time[length(clock$Time)] )) ; print(clock, row.names = FALSE) }
  
}
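
A usage sketch (the step names and sleeps are illustrative stand-ins for real work):

tickTack("init")
Sys.sleep(0.5)
tickTack("tick", "first step")
Sys.sleep(0.2)
tickTack("tack")   # logs the final step and prints the timing table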
  

r #R#min_max2

Minimum and maximum of a numeric vector

foundational_programming-r-min_max2.R
r_vector <- 1:20

min(r_vector)
#> [1] 1
max(r_vector)
#> [1] 20
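
range() returns both extremes in a single call:

range(r_vector)
#> [1]  1 20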

r Compute word frequencies on the web pages returned for a search query

Scrape Google results for a query, fetch each linked page, and tally word frequencies with RMeCab

mecab.search.result.R
library(rvest)
library(plyr)
library(stringr)
library(RMeCab)


source_url <- "https://www.google.co.jp/search?q=BIツール"
# html <- read_html(source_url, encoding = "UTF-8")
html <- read_html(source_url, encoding = "Shift-JIS")

title_nodes <- html_nodes(html, "h3")
title_nodes <- title_nodes %>% html_nodes("a") %>% html_attr("href")

# TODO: loop over title_nodes
# url <- "http://www.google.co.jp/aclk?sa=l&ai=DChcSEwjbmP_Hy8HeAhWNvGQKHSAqDsIYABAAGgJwag&sig=AOD64_2Zb5Jqz5T-ZGayr2bkUggaSz25xA&ved=0ahUKEwjGsvvHy8HeAhXtHDQIHZQJABEQ0QwIEg&adurl="
for( url in title_nodes ){
  print(paste("Google",url,sep=":"))
  y <- try( html <- read_html(url, encoding = "UTF-8"), silent = FALSE )
  if( inherits(y, "try-error") ) next

  list <- html_nodes(html, "a") %>% html_attr("href")
  # TODO: loop over list
  # url <- "https://bi.lakeel.com/seminar/"
  for( url2 in list ){
    print(paste("link",url2,sep=":"))
    # assign the try() result so the error check below actually tests this fetch
    y <- try( html <- read_html(url2, encoding = "UTF-8"), silent = FALSE )
    if( inherits(y, "try-error") ) next
  
    body <- html_nodes(html, "body") %>% html_text()
    body <- gsub("\n","",body)
    body <- gsub("\t","",body)
    body <- gsub("\r","",body)
    write(body,"body.txt")
    freq<-RMeCabFreq("body.txt")
    if( url2 == list[1] ) freq.all <- freq
    else freq.all <- rbind(freq.all, freq)
    # TODO: sum the freq tables into a final aggregate
  }
  if( url == title_nodes[1] ) text <- freq.all
  else text <- rbind(text, freq.all)
}

text2<-ddply(text,.(Term,Info1,Info2),summarize,Freqs=sum(Freq))
text2<-subset(text2,Info1=="名詞")  # keep nouns ("名詞") only
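
The plyr aggregation at the end can also be written with dplyr, for readers already in the tidyverse; a sketch assuming the same `text` data frame:

library(dplyr)
text2 <- text %>%
  group_by(Term, Info1, Info2) %>%
  summarize(Freqs = sum(Freq), .groups = "drop") %>%
  filter(Info1 == "名詞")   # keep nouns only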