r R模型创建和摘要统计

modelCreationSummary.r
# Library Dependencies
library(tidyverse)
library(modelr)
library(broom)

# Example Model Logic

model1 <- lm(avgHouseIncome ~ population, data=dfAgg)
model2 <- lm(avgHouseIncome ~ wgtEducation, data=dfAgg)
model3 <- lm(avgHouseIncome ~ population * wgtEducation, data=dfAgg)
model4 <- lm(avgHouseIncome ~ population + wgtEducation, data=dfAgg)

model5 <- rlm(avgHouseIncome ~ population, data=dfAgg)
model6 <- rlm(avgHouseIncome ~ wgtEducation, data=dfAgg)
model7 <- rlm(avgHouseIncome ~ population * wgtEducation, data=dfAgg)
model8 <- rlm(avgHouseIncome ~ population + wgtEducation, data=dfAgg)

model9 <- lm(avgHouseIncome ~ population * wgtEducation + poplation + wgtEducation, data=dfAgg)
model10 <- lm(avgHouseIncome ~ population * wgtEducation + poplation, data=dfAgg)
model11 <- lm(avgHouseIncome ~ population * wgtEducation + wgtEducation, data=dfAgg)

# Combine Model Data

grid1 <- dfAgg %>%
    data_grid(population, wgtEducation) %>%
    gather_predictions(model1, model2, model3, model4)

grid2 <- dfAgg %>%
    data_grid(population, wgtEducation) %>%
    gather_predictions(model5, model6, model7, model8)

grid3 <- dfAgg %>%
    data_grid(population, wgtEducation) %>%
    gather_predictions(model9, model10, mode11)

# Create Model Performance Data Sets

glance1 <- tibble(c(modelNum="model1", glance(model1))
glance2 <- tibble(c(modelNum="model2", glance(model2))
glance3 <- tibble(c(modelNum="model3", glance(model3))
glance4 <- tibble(c(modelNum="model4", glance(model4))
glance5 <- tibble(c(modelNum="model5", glance(model5))
glance6 <- tibble(c(modelNum="model6", glance(model6))
glance7 <- tibble(c(modelNum="model7", glance(model7)) 
glance8 <- tibble(c(modelNum="model8", glance(model8))
glance9 <- tibble(c(modelNum="model9", glance(model9))
glance10 <- tibble(c(modelNum="model10", glance(model10))
glance11 <- tibble(c(modelNum="model11", glance(model11))

glanceAll <- full_join(glance1, glance2, by = modelNum) %>%
    full_join(., glance3, by =modelNum) %>%
    full_join(., glance4, by =modelNum) %>%
    full_join(., glance5, by =modelNum) %>%
    full_join(., glance6, by =modelNum) %>%
    full_join(., glance7, by =modelNum) %>%
    full_join(., glance8, by =modelNum) %>%
    full_join(., glance9, by =modelNum) %>%
    full_join(., glance10, by =modelNum) %>%
    full_join(., glance11, by =modelNum)

r R数据创建和合并

dataCreationMapping.r
# Library Dependencies
library(tidyverse)

# Example Data
df1 <- tibble(
    city=c("Tampa", "Clearwater", "St. Pete", "Dunedin", "Palm Harbor", "Tarpon Spring", "Largo", "Saftey Harbor"),
    population=c(750000, 350000, 450000, 75000, 55000, 40000, 100000, 75000)
)

df2 <- tibble(
    city=c("Tampa", "Clearwater", "St. Pete", "Dunedin", "Palm Harbor", "Tarpon Spring", "Largo", "Saftey Harbor"),
    avgHouseIncome=c(65000, 35000, 45000, 30000, 55000, 25000, 27500, 50000)
)

df3 <- tibble(
    city=c("Tampa", "Clearwater", "St. Pete", "Dunedin", "Palm Harbor", "Tarpon Spring", "Largo", "Saftey Harbor"),
    wgtEducation=c(3,1,2,1,3,1,1,2)
)

# Example Tidy Piping and Joining

dfAgg <- df1 %>%
    left_join(df2, by = city) %>%
    left_join(df3, by = city)

r 用R中的0替换NA,NAN和Inf

rem_nan_inf.r
df[is.na(df)] <- 0
  
is.nan.data.frame <- function(x)
  do.call(cbind, lapply(x, is.nan))
df[is.nan(df)] <- 0

is.infinite.data.frame <- function(x)
  do.call(cbind, lapply(x, is.infinite))
df[is.infinite(df)] <- 0

r R ifelse()中的条件语句

ifelse.r
mydata$is_USA <- ifelse(mydata$Country == "USA",1,0)

r 在lapply函数中保留名称

使用lapply函数但保留名称

keep_lapply_names.R
  # Dummy example to generate 2 lists. One without and other with names.
  # cf : https://stackoverflow.com/questions/9469504/access-and-preserve-list-names-in-lapply-function/18520422#18520422
  
  # Get numeric columns from iris
  cols <- names(which(sapply(iris, is.numeric)))
  
  # Compute mean from each numeric column
  res <- lapply(cols, function(i) {
    mean(iris[,i])
  })
  # [[1]]
  # [1] 5.843333
  # 
  # [[2]]
  # [1] 3.057333
  # 
  # [[3]]
  # [1] 3.758
  # 
  # [[4]]
  # [1] 1.199333
  
  res <- lapply(setNames(cols,cols), function(i) {
    mean(iris[,i])
  })
  
  # $Sepal.Length
  # [1] 5.843333
  # 
  # $Sepal.Width
  # [1] 3.057333
  # 
  # $Petal.Length
  # [1] 3.758
  # 
  # $Petal.Width
  # [1] 1.199333
  

r 获取包含具有最小nlevels的因子的数据集

获取包含具有最小nlevels的因子的数据集

find_datasets.R
load_data <- function(name) {
  env <- new.env()
  data(list = name, envir = env)
  env[[name]]
}

desc <- function(package, exported = TRUE, nlevelsmin = 5) {
  if (exported) {
    datasets <- ls(paste0("package:",package))
  } else {
    datasets <- data(package = package)$results[,"Item"]
  }
  res <- sapply(datasets, function(x) {
    if (exported) {
      df <- get(x)
    } else {
      df <- load_data(x)
    }
    if (inherits(df,"data.frame")) {
      # Keep only variables with factors > nlevelsmin
      df <- df[,sapply(df, nlevels) > nlevelsmin, drop=FALSE]
      if (ncol(df) > 0) {
        sapply(df, nlevels)
      }
    }
  })
  res[sapply(res, is.null)] <- NULL
  res
}

# See (exported) data.frame from package datasets containing factors with nlevels > 10
desc("datasets", nlevelsmin = 15)
# $attenu
# station 
# 117 
# 
# $ChickWeight
# Chick 
# 50 

# See (not exported) data.frame from package mlbench containing factors with nlevels > 20
library(mlbench)
desc("mlbench", exported = FALSE, nlevelsmin = 20)
# $BostonHousing2
# town 
# 92 
# 
# $LetterRecognition
# lettr 
# 26 
# 
# $Ozone
# V2 
# 31 

r 在R中更改列名称

重命名列名称。

change_column.r
colnames(mydata)[colnames(mydata)=="Genre"] <- "Genre2"

r R中的语句(过滤器)

filter_dplyr.r
mydata2 <- mydata[mydata$Genre %in% c("Fantasy","Action"),]

#or

library(dplyr)
mydata2 <- mydata %>% filter(Genre %in% c("Fantasy","Action"))

r 从R中的日期提取月份

month.r
First_updated_month <- format(mydata$First_updated,"%m")

r 将字符串列表转换为R中的数字

convert_string_num.r
x <- c("1","2","3")
x_num <- as.numeric(x)