r R中的清洁数据

https://www.datacamp.com/courses/cleaning-data-in-r

exploring.R
# Exploring raw data: quick structural and visual checks on `weather` and `bmi`.
# NOTE(review): neither object is created in this section - presumably both are
# preloaded by the course workspace; confirm before running this standalone.
# Every call below is evaluated only for its printed or plotted output.

# View the first 6 rows of data (6 is head()'s default)
head(weather)

# View the last 6 rows of data (6 is tail()'s default)
tail(weather)

# View a condensed summary of the data
str(weather)

# Check the class of bmi
class(bmi)

# Check the dimensions of bmi (rows, then columns)
dim(bmi)

# View the column names of bmi
names(bmi)

# Check the structure of bmi
str(bmi)

# Load dplyr (provides glimpse())
library(dplyr)

# Check the structure of bmi, the dplyr way
glimpse(bmi)

# View a summary of bmi
summary(bmi)

# Print bmi to the console
bmi

# View the first 15 rows
head(bmi, 15)

# View the last 10 rows
tail(bmi, 10)

# Histogram of BMIs from 2008
hist(bmi$Y2008)

# Scatter plot comparing BMIs from 1980 (x axis) to those from 2008 (y axis)
plot(bmi$Y1980, bmi$Y2008)
tidy.R
# Reshaping examples: gather/spread/separate/unite from tidyr, arrange from
# dplyr.
# NOTE(review): bmi, bmi_cc, census, census_long and census_long3 are not
# created in this section - presumably supplied by the course workspace.

# Load the packages that provide the verbs used below; without tidyr the
# first gather() call fails with "could not find function"
library(tidyr)
library(dplyr)

# Gathering columns into key-value pairs
# Apply gather() to bmi and save the result as bmi_long
# (every column except Country is stacked into year / bmi_val pairs)
bmi_long <- gather(bmi, year, bmi_val, -Country)

# Spreading key-value pairs into columns
# Apply spread() to bmi_long (one column per distinct value of year)
bmi_wide <- spread(bmi_long, year, bmi_val)

# Separating columns
# Apply separate() to bmi_cc: split Country_ISO at "/" into Country and ISO
bmi_cc_clean <- separate(bmi_cc, col = Country_ISO, into = c("Country", "ISO"), sep = "/")

# Uniting columns
# Apply unite() to bmi_cc_clean: rejoin Country and ISO, now separated by "-"
bmi_cc <- unite(bmi_cc_clean, Country_ISO, Country, ISO, sep = "-")

# Column headers are values, not variable names
# Gather the month columns (everything except YEAR)
census2 <- gather(census, month, amount, -YEAR)

# Arrange rows by YEAR using dplyr's arrange
census2_arr <- arrange(census2, YEAR)

# Variables are stored in both rows and columns
# Spread the type column
census_long2 <- spread(census_long, type, amount)

# Multiple values are stored in one column
# Separate the yr_month column into two (default separator: any
# non-alphanumeric character)
census_long4 <- separate(census_long3, yr_month, c("year", "month"))
preparing.R
# Variable Types
# character: "treatment", "123", "A"
# numeric: 23.44, 120, NaN, Inf
# integer: 4L, 1123L
# factor: factor("Hello"), factor(8)
# logical: TRUE, FALSE, NA

# Conversions
# Each class() call below is evaluated only for its printed result.
# NOTE(review): students, students2, students3 and states are not created in
# this section - presumably supplied by the course workspace.

# Make this evaluate to "character"
class("TRUE")

# Make this evaluate to "numeric"
class(8484.00)

# Make this evaluate to "integer" (the L suffix marks an integer literal)
class(99L)

# Make this evaluate to "factor"
class(factor("factor"))

# Make this evaluate to "logical"
class(FALSE)

# Coerce Grades to character
students$Grades <- as.character(students$Grades)

# Coerce Medu to factor
students$Medu <- as.factor(students$Medu)

# Coerce Fedu to factor
students$Fedu <- as.factor(students$Fedu)

# Working with dates
# Load the lubridate package
library(lubridate)

# Parse as date (day-month-year order)
dmy("17 Sep 2015")

# Parse as date and time (with no seconds!)
mdy_hm("July 15, 2012 12:56")

# Coerce dob to a date (with no time)
students2$dob <- ymd(students2$dob)

# Coerce nurse_visit to a date and time
students2$nurse_visit <- ymd_hms(students2$nurse_visit)

# String manipulation
# Trimming and padding strings
# Load the stringr package
library(stringr)

# Trim all leading and trailing whitespace
str_trim(c("   Filip ", "Nick  ", " Jonathan"))

# Pad these strings with leading zeros up to a total width of 9 characters
str_pad(c("23485W", "8823453Q", "994Z"), width = 9, side = "left", pad = "0")

# Upper and lower case
# Make states all uppercase and save result to states_upper
states_upper <- toupper(states)

# Make states_upper all lowercase again (printed only; the result is not saved)
tolower(states_upper)

# Finding and replacing strings
# Detect all dates of birth (dob) in 1997
# (returns one TRUE/FALSE per element, TRUE where "1997" occurs)
str_detect(students3$dob, "1997")

# In the sex column, replace "F" with "Female" ...
# (str_replace() changes only the first match within each string)
students3$sex <- str_replace(students3$sex, "F", "Female")

# ... and "M" with "Male"
students3$sex <- str_replace(students3$sex, "M", "Male")

# Missing and special values
# How missing and special values are typically encoded:
# R - NA
# SPSS, SAS - .
# ? - "" (empty string)
# Inf (infinite)

# Finding missing values
# NOTE(review): social_df and students3 are not created in this section -
# presumably supplied by the course workspace.
# Call is.na() on the full social_df to spot all NAs
is.na(social_df)

# Use the any() function to ask whether there are any NAs in the data
any(is.na(social_df))

# View a summary() of the dataset
summary(social_df)

# Call table() on the status column
table(social_df$status)

# Dealing with missing values
# Replace all empty strings in status with NA
social_df$status[social_df$status == ""] <- NA

# Print social_df to the console
social_df

# Use complete.cases() to see which rows have no missing values
# (one TRUE/FALSE per row)
complete.cases(social_df)

# Use na.omit() to remove all rows with any missing values
# (printed only; social_df itself is not modified)
na.omit(social_df)

# Outliers and obvious errors
# Dealing with outliers and obvious errors
# Look at a summary() of students3
summary(students3)

# View a histogram of the age variable
hist(students3$age)

# View a histogram of the absences variable
hist(students3$absences)

# View a histogram of absences, but force zeros to be bucketed to the right of zero
# (right = FALSE makes the bins left-closed, so 0 falls in the first bin)
hist(students3$absences, right = FALSE)

# View a boxplot of age
boxplot(students3$age)

# View a boxplot of absences
boxplot(students3$absences)
cleaningData.R
### --- SUMMARY ---
# End-to-end cleaning of `weather`: inspect, reshape to one row per day,
# build a proper date column, then examine the column types.
# NOTE(review): weather is not created in this section - presumably supplied
# by the course workspace.

## Get a feel for the data
# Verify that weather is a data.frame
class(weather)

# Check the dimensions
dim(weather)

# View the column names
names(weather)

# Summarize the data
# View the structure of the data
str(weather)

# Load dplyr package
library(dplyr)

# Look at the structure using dplyr's glimpse()
glimpse(weather)

# View a summary of the data
summary(weather)

# Take a closer look
# View first 6 rows
head(weather)

# View first 15 rows
head(weather, 15)

# View the last 6 rows
tail(weather)

# View the last 10 rows
tail(weather, 10)

## Column names are values
# Load the tidyr package
library(tidyr)

# Gather the day columns X1..X31 into day / value pairs, dropping rows whose
# value is NA
weather2 <- gather(weather, day, value, X1:X31, na.rm = TRUE)

# View the head
head(weather2)

# Values are variable names
# First remove column of row names (dropped by position: the first column)
without_x <- weather2[, -1]

# Spread the data: one column per distinct value of measure
weather3 <- spread(without_x, measure, value)

# View the head
head(weather3)

## Clean up dates
# Load the stringr and lubridate packages
library(stringr)
library(lubridate)

# Remove X's from day column
weather3$day <- str_replace(weather3$day, "X", "")

# Unite the year, month, and day columns into a single "year-month-day" string
weather4 <- unite(weather3, date, year, month, day, sep = "-")

# Convert date column to proper date format using lubridate's ymd()
weather4$date <- ymd(weather4$date)

# Rearrange columns using dplyr's select(): date first, then Events, then the
# measurement columns
weather5 <- select(weather4, date, Events, CloudCover:WindDirDegrees)

# View the head of weather5
head(weather5)

## A closer look at column types
# View the structure of weather5
str(weather5)

# Examine the first 20 rows of weather5. Are most of the characters numeric?
head(weather5, 20)

# See what happens if we try to convert PrecipitationIn to numeric
# (printed only; weather5 is not modified here)
as.numeric(weather5$PrecipitationIn)

## Column type conversions
# Replace "T" with "0" (T = trace) so the column can be parsed as a number
weather5$PrecipitationIn <- str_replace(weather5$PrecipitationIn, "T", "0")

# Convert characters to numerics
# as.numeric is passed directly: the funs() wrapper is deprecated in dplyr and
# mutate_at() accepts a plain function
weather6 <- mutate_at(weather5, vars(CloudCover:WindDirDegrees), as.numeric)

# Look at result
str(weather6)

## Find missing values
# Count missing values (total across all columns)
sum(is.na(weather6))

# Find missing values (summary() reports the NA count per column)
summary(weather6)

# Find indices of NAs in Max.Gust.SpeedMPH
ind <- which(is.na(weather6$Max.Gust.SpeedMPH))

# Look at the full rows for records missing Max.Gust.SpeedMPH
weather6[ind, ]

## An obvious error
# Review distributions for all variables
summary(weather6)

# Find row with Max.Humidity of 1000 (ind is reused for each lookup below)
ind <- which(weather6$Max.Humidity == 1000)

# Look at the data for that day
weather6[ind, ]

# Change 1000 to 100
weather6$Max.Humidity[ind] <- 100

## Another obvious error
# Look at summary of Mean.VisibilityMiles
summary(weather6$Mean.VisibilityMiles)

# Get index of row with -1 value
ind <- which(weather6$Mean.VisibilityMiles == -1)

# Look at full row
weather6[ind, ]

# Set Mean.VisibilityMiles to the appropriate value
# NOTE(review): 10 is presumably read off the row inspected above - confirm
weather6$Mean.VisibilityMiles[ind] <- 10

## Check other extreme values
# Review summary of full data once more
summary(weather6)

# Look at histogram for MeanDew.PointF
hist(weather6$MeanDew.PointF)

# Look at histogram for Min.TemperatureF
hist(weather6$Min.TemperatureF)

# Compare to histogram for Mean.TemperatureF
hist(weather6$Mean.TemperatureF)

## Finishing touches
# Clean up column names
# NOTE(review): new_colnames is not defined in this section - presumably
# supplied by the course workspace, one name per column of weather6
names(weather6) <- new_colnames

# Replace empty cells in events column
# (assumes new_colnames renamed the Events column to "events" - TODO confirm)
weather6$events[weather6$events == ""] <- "None"

# Print the first 6 rows of weather6
head(weather6)


r 情节拉伸传奇

plot
# Plot `r` with a break-based colour ramp and add a compact colour-bar legend.
#
# Args:
#   r:      object to draw; passed as the first argument to plot().
#   breaks: vector of class breaks; length(breaks) - 1 colours are requested.
#   pal:    function that takes a count n and returns n colours.
#   ...:    further arguments forwarded to the first plot() call.
#
# The map is drawn without axes and without the default legend. A legend-only
# plot is then added with 7 rounded ticks running from the second break to the
# second-to-last break; the outermost ticks are labelled "< lo" and "> hi".
# Returns whatever the final plot() call returns.
PlotStretchLegend <- function(r, breaks, pal, ...) {
  n_colors <- length(breaks) - 1
  plot(r, col = pal(n_colors), breaks = breaks, xaxt = "n", yaxt = "n",
       legend = FALSE, ...)

  # add a reasonable legend
  legend_at <- round(
    seq(breaks[2], breaks[length(breaks) - 1], length.out = 7)
  )
  n_ticks <- length(legend_at)
  # Date-labelled variant, kept for reference:
  # legend_at_date <- as.Date(legend_at, origin="1970-1-1")
  # legend_labels <- c(paste("<", legend_at_date[1]), as.character(legend_at_date[2:(length(legend_at_date) - 1)]), paste(">", legend_at_date[length(legend_at_date)]))
  legend_labels <- c(
    paste("<", legend_at[1]),
    as.character(legend_at[2:(n_ticks - 1)]),
    paste(">", legend_at[n_ticks])
  )

  # Dummy one-column raster spanning the tick range, drawn as legend only
  plot(
    raster(matrix(legend_at[1]:legend_at[n_ticks])),
    legend.only = TRUE,
    col = pal(n_colors),
    axis.args = list(at = legend_at, labels = legend_labels)
  )
}

r 在R中的table()函数中包含NA

table_na.r
# Tabulate Genre and always report an NA count, even when that count is zero
table(
  mydata$Genre,
  useNA = "always"
)

r 使用dplyr重命名R中的列

rename_dplyr.r
# Rename Budget -> Budget_new and Genre -> Genre_new (new_name = old_name)
mydata <- rename(mydata, Budget_new = Budget, Genre_new = Genre)

r 将所有列转换为R中的字符

convert_to_char.r
# Convert every factor column of mydata to character; columns of any other
# type are left untouched
mydata <- mutate_if(mydata, is.factor, as.character)

r 更改R公式中的变量

variable-in-formula
#https://www.r-bloggers.com/changing-the-variable-inside-an-r-formula/
# Four ways to fit lm(<response> ~ hp) on mtcars when the response name is
# held in a character variable. Each loop prints the fitted coefficients.
data(mtcars)
head(mtcars)

# Baseline: the response is written literally in the formula
coef(lm(mpg ~ hp, data = mtcars))

response_list <- c("mpg", "disp", "drat")

# 1. Paste the formula together as text and convert it with as.formula()
for (y in response_list) {
  lmfit <- lm(
    as.formula(paste(y, "~ hp")),
    data = mtcars
  )
  print(coef(lmfit))
}

# 2. Index the data frame directly; no data argument is needed
for (y in response_list) {
  lmfit <- lm(mtcars[[y]] ~ mtcars$hp)
  print(coef(lmfit))
}

# 3. Look the column up by name with get()
for (y in response_list) {
  lmfit <- lm(
    get(y) ~ hp,
    data = mtcars
  )
  print(coef(lmfit))
}

# 4. Parse the name as code and evaluate it (shown for completeness; the
#    approaches above are easier to reason about)
for (y in response_list) {
  lmfit <- lm(
    eval(parse(text = y)) ~ hp,
    data = mtcars
  )
  print(coef(lmfit))
}

r 使用TeX Latex ggplot2进行刻面包装

facet_wrap_with_tex.R
library(latex2exp)
# Facet by `variable`, passing each facet label through latex2exp's TeX() with
# label_parsed as the default labeller. This is a layer: add it to an existing
# ggplot with `+` (ggplot2 must be attached).
facet_wrap(
  ~ variable,
  labeller = as_labeller(TeX, default = label_parsed)
)

r 治疗持续时间

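Use the FIRST. and LAST. BY-group variables to compute, for each patient, the time from the first visit to the last visit.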

Treatment duration

/* Example input: one observation per patient visit (PatientID, Date, Weight). */
data Patients;
informat Date date7.;                /* read dates such as 04Jan16 */
format Date date7. PatientID Z4.;    /* print dates as ddMONyy, IDs zero-padded to 4 digits */
/* The trailing @@ holds the input line so that several observations
   (two per line below) are read from each data line. */
input PatientID Date Weight @@;
datalines;
1021 04Jan16  302  1042 06Jan16  285
1053 07Jan16  325  1063 11Jan16  291
1053 01Feb16  299  1021 01Feb16  288
1063 09Feb16  283  1042 16Feb16  279
1021 07Mar16  280  1063 09Mar16  272
1042 28Mar16  272  1021 04Apr16  273
1063 20Apr16  270  1053 28Apr16  289
1053 13May16  295  1063 31May16  269
;
run;

/* Order the visits chronologically within each patient; BY-group processing
   with FIRST./LAST. variables requires the data to be sorted by the BY variable. */
proc sort data=Patients;
   by PatientID Date;
run;
 
/* One output row per patient: elapsed days between the first and last visit,
   starting and ending weight, total weight loss, and average loss per day.
   Expects Patients to be sorted by PatientID and Date so that FIRST.PatientID
   and LAST.PatientID mark the earliest and latest visit of each patient. */
data weightLoss;
   set Patients;
   by PatientID;
   retain startDate startWeight;                 /* RETAIN the starting values across rows */
   if first.PatientID then do;
      startDate = Date;                          /* remember the initial values */
      startWeight = Weight;
   end;
   if last.PatientID then do;
      endDate = Date;
      endWeight = Weight;
      elapsedDays = intck('day', startDate, endDate); /* elapsed time (in days) */
      weightLoss = startWeight - endWeight;           /* weight loss */
      /* A patient with a single visit has elapsedDays = 0: leave the average
         missing instead of dividing by zero. */
      if elapsedDays > 0 then
         AvgWeightLoss = weightLoss / elapsedDays;    /* average weight loss per day */
      else
         AvgWeightLoss = .;
      output;                                         /* output only the last record in each group */
   end;
run;

/* Name the dataset explicitly rather than relying on the most recently
   created one. */
proc print data=weightLoss noobs;
   var PatientID elapsedDays startWeight endWeight weightLoss AvgWeightLoss;
run;

r BA760 - 扑克例如,discussion.r

BA760-poker-example-discussion.r
# Worked answer to a student question: why does poker_vector[c(1:5) > 0]
# return the whole vector instead of only the winning days?
# The expressions below are kept exactly as discussed; each bare expression is
# evaluated only for its printed result.
options(stringsAsFactors = FALSE)


#################### setup

## define the vectors
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)

## we are covering named vectors in module 3 on Friday
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector

## which days did we make money on poker
selection_vector <- c(poker_vector)>0
selection_vector

## why is this code different (your question)
selection_vector <- poker_vector[c(1:5)>0]

############### explanation 1
c(1:5) # is just a vector, and because you are using :, you actually don't need the c
# c(1:5) == 1:5
# identical(c(1:5), 1:5)

# every element of the vector 1:5 is greater than zero
1:5 > 0

## in short, above, you aren't using the positional indexes to look up values;
## you are comparing the index numbers 1:5 themselves to zero.
## This question is asking for the evaluation of the values against zero

# simply put, you are passing in 5 TRUEs, which returns the entire poker_vector
poker_vector[1:5 > 0] == poker_vector ## one way to check
identical(poker_vector[1:5>0], poker_vector)  # check if the objects are identical

# but when we ask if the vector is > 0
poker_vector > 0

# we see that there are TRUEs and FALSEs
# we use the T/F to select only the positive values, the TRUEs
selection_vector2 <- poker_vector > 0

# and now just return the positive values
poker_vector[selection_vector2]



################# explanation 2
selection_vector3 <- poker_vector[c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")>0]

## inside the poker vector, you are doing a logical comparison of strings against zero
c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")>0

## simply put, above, you are comparing a character vector (the names themselves),
## not the values stored under those names, to zero

## show this a different way
dow <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
dow > 0
is.character(dow)
is.vector(dow)
typeof(dow)

## comparing strings to zero looks bizarre, but it is well defined: the number 0
## is coerced to the string "0" and the strings are then compared in the
## locale's sort order, where (in typical locales) letters sort after digits

## because the comparisons are all TRUE, the entire vector is returned


##### summary
## it doesn't hurt to use c(1:5), but I showed above that it is identical to 1:5,
## so the c is overkill but not a bad thing as you are learning
## you need to compare the vector's values element-wise
## if you want to compare selected elements, do the comparison outside the brackets
poker_vector[c(2,4,5)]
poker_vector[c(2,4,5)] > 0
## note above I used c because I am not defining the vector by a sequence, but am building it with c

r 每列替换NA

replace NA
# Replace missing values with 0, but only in columns "a" and "b" of x
x[c("a", "b")] <- replace(x[c("a", "b")], is.na(x[c("a", "b")]), 0)