r 在R中创建空数据框

创建包含3列的空数据框

empty_dr.r
# Build a zero-row data frame with three named columns in one expression:
# a 0 x 3 matrix supplies the shape, setNames() supplies the column names.
empty_df <- setNames(
  data.frame(matrix(nrow = 0, ncol = 3)),
  c("Col1", "Col2", "Col3")
)

r 在R中排序数据帧

在R中对数据帧进行排序并重置索引

sort.r
# Sort by Budget, then Genre, both descending, and renumber the rows.
sort_order <- order(mydata$Budget, mydata$Genre, decreasing = TRUE)
mydata <- mydata[sort_order, ]
# Assigning NULL resets the row names to the default 1..n sequence.
rownames(mydata) <- NULL

r 用平均值替换所有na值

replace_na_many_columns
# Replace every NA with the mean of its column computed within the same
# STATENAME group. Columns UNITID through REGION and INSTNM are left untouched.
# across() replaces the superseded mutate_at()/vars() pair, and ungroup()
# drops the STATENAME grouping so later verbs do not silently run per state.
# NOTE(review): a group whose values are all NA receives NaN (mean of an
# empty set) -- confirm whether that is acceptable downstream.
college_data_processed <- college_data_processed %>%
  group_by(STATENAME) %>%
  mutate(across(
    c(-(UNITID:REGION), -INSTNM),
    ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)
  )) %>%
  ungroup()

r 导出到Netcdf

exportNetcdf.R
## Export to netcdf
##
## Writes the climatology grid plus its coordinate vectors to a compressed
## NetCDF file using the ncdf4 package. Expects `longitudes`, `latitudes`,
## `depths`, `predictors` and `climatologyGrid` to exist already.
## NOTE(review): climatologyGrid is presumably indexed as
## [longitude, latitude, depth, predictor], matching the dimension list used
## for the "Grid" variable -- confirm against the code that builds it.

mv <- -99999 # missing value to use

# Integer index standing in for the predictors. seq_along() stays well-defined
# for an empty vector, whereas 1:length(x) would yield c(1, 0). Computed once
# so the dimension definition and the values written cannot drift apart.
predictorIndex <- seq_along(predictors)

# Dimensions
dimX <- ncdim_def("longitudes", "unit", longitudes)
dimY <- ncdim_def("latitudes", "unit", latitudes)
dimD <- ncdim_def("depths", "meters", depths)
dimP <- ncdim_def("predictors", "unit", predictorIndex)

# Variables: four 1-D coordinate variables and the 4-D grid itself
var1d <- ncvar_def("longitude", "degrees", dimX, mv, compression = 9)
var2d <- ncvar_def("latitude", "degrees", dimY, mv, compression = 9)
var3d <- ncvar_def("depth", "meters", dimD, mv, compression = 9)
var4d <- ncvar_def("predictor", "unit", dimP, mv, compression = 9)
var5d <- ncvar_def("Grid", "c", list(dimX, dimY, dimD, dimP), mv,
                   prec = "double", compression = 9)

ncFile <- nc_create("Dependencies/SpatialData/climatologyGrid.nc",
                    list(var1d, var2d, var3d, var4d, var5d))

# Close the file even when a write fails, so the handle is not leaked and
# whatever was written so far is flushed to disk; the error still propagates.
tryCatch(
  {
    ncvar_put(ncFile, var1d, longitudes)
    ncvar_put(ncFile, var2d, latitudes)
    ncvar_put(ncFile, var3d, depths)
    ncvar_put(ncFile, var4d, predictorIndex)
    ncvar_put(ncFile, var5d, climatologyGrid)
  },
  finally = nc_close(ncFile)
)

r 使用多台计算机的R并行聚类

使用多台计算机的R并行聚类

Cluster.R

# 
# remote
# ssh eliza@10.36.5.135
# ssh-keygen -t rsa
# touch authorized_keys
# cd .ssh ; ls -la
# cat id_rsa.pub
# mkdir -p ~/.ssh && touch ~/.ssh/authorized_keys
# chmod 700 ~/.ssh && chmod 600 ~/.ssh/authorized_keys

# localhost
# ssh-keygen -t rsa
# sudo cat /Users/jorgeassis/.ssh/id_rsa.pub
# sudo ssh -i /Users/jorgeassis/.ssh/id_rsa.pub eliza@10.36.5.135
# cd .ssh ; ls -la
# touch authorized_keys
# cat id_rsa.pub
# chmod u+w authorized_keys
# scp id_rsa.pub eliza@10.36.5.135:~/.ssh/authorized_keys

For others who might be experiencing the same problem: what ended up working was adding `192.0.0.1 localhost` to my /etc/hosts file. Warning: with that entry in place, normal parallel code will hang.

library(parallel)

# Machines taking part in the cluster: address, login user and number of
# worker processes to start on each.
primary <- "localhost"
machineAddresses <- list(
  list(host = primary, user = "jorgeassis", ncore = 4),
  list(host = "10.36.5.135", user = "eliza", ncore = 4)
)

# Flatten to one list(host, user) entry per worker process, which is the
# shape makeCluster() takes for its spec argument.
spec <- do.call(c, lapply(machineAddresses, function(node) {
  worker <- list(host = node$host, user = node$user)
  rep(list(worker), times = node$ncore)
}))

# Start the socket cluster; workers connect back to `primary`.
parallelCluster <- parallel::makeCluster(
  spec = spec,
  type = "PSOCK",
  master = primary
)
print(parallelCluster)

r 将字符串(YYYYMMDD)转换为日期,然后将其转换为R

date_string.r
## convert string to date
## "%d%m%Y" reads the input as day, month, four-digit year (DDMMYYYY)
date <- as.Date("01012018", format = "%d%m%Y")

## convert date to string
## format() is the supported way to render a Date with a pattern; the format
## argument of as.character() on Date objects is no longer honoured in
## R >= 4.3, which would return "2018-01-01" here instead.
date_string <- format(date, "%d%m%Y")

r 显示R中所有列的数据类型

str.r
# Print a compact summary of mydata's structure. For a data frame this lists
# each column with its type and first few values (mydata is presumably a data
# frame loaded elsewhere).
str(mydata)

r 按R计数分组

按流派分组并按流派计算电影

group_count.r
# Count titles per genre: aggregate() applies length() to Title within each
# Genre group, so the result has one row per Genre and the count is stored in
# a column named Title. With the formula interface, rows with NA in Title or
# Genre are dropped by default (na.action = na.omit).
mydata_group <- aggregate(Title ~ Genre, data = mydata, FUN = length)  

r 计算R中的不同值

计算类型的不同值。

count_distinct.R
# Number of distinct values in the Genre column. unique() keeps NA as a value
# of its own, so missing genres add one to the count.
length(unique(mydata$Genre))

r 计算R中的行数

行数

number_rows.r
# Number of rows in mydata.
nrow(mydata)