# Peek at both ends of weather, then print a condensed overview of it
head(weather, n = 6)
tail(weather, n = 6)
str(weather)

# Basic facts about bmi: class, dimensions, column names and structure
class(bmi)
dim(bmi)
names(bmi)
str(bmi)

# dplyr provides glimpse() as its own take on str()
library(dplyr)
glimpse(bmi)

# Summary of bmi
summary(bmi)

# Print bmi itself to the console
bmi

# First 15 rows, then last 10 rows, of bmi
head(bmi, n = 15)
tail(bmi, n = 10)

# Histogram of the Y2008 column (BMIs from 2008)
hist(bmi$Y2008)

# Scatter plot of the 1980 BMIs against the 2008 BMIs
plot(bmi$Y1980, bmi$Y2008)
# ---- tidy.R ----
# Reshaping exercises: gather(), spread(), separate() and unite() all come
# from tidyr, so attach it before the first call in this section.
library(tidyr)

# Gathering columns into key-value pairs
# Apply gather() to bmi and save the result as bmi_long: every column except
# Country is stacked into a year / bmi_val pair
bmi_long <- gather(bmi, year, bmi_val, -Country)

# Spreading key-value pairs into columns
# Apply spread() to bmi_long: one column per distinct value of year,
# filled from bmi_val
bmi_wide <- spread(bmi_long, year, bmi_val)

# Separating columns
# Apply separate() to bmi_cc: split Country_ISO at "/" into Country and ISO
bmi_cc_clean <- separate(bmi_cc, col = Country_ISO, into = c("Country", "ISO"), sep = "/")

# Uniting columns
# Apply unite() to bmi_cc_clean: paste Country and ISO back together with "-"
# (note that this overwrites the bmi_cc used as input above)
bmi_cc <- unite(bmi_cc_clean, Country_ISO, Country, ISO, sep = "-")

# Column headers are values, not variable names
# Gather the month columns (everything except YEAR) into month / amount pairs
census2 <- gather(census, month, amount, -YEAR)
# Arrange rows by YEAR using dplyr's arrange
census2_arr <- arrange(census2, YEAR)

# Variables are stored in both rows and columns
# Spread the type column, filling the new columns from amount
census_long2 <- spread(census_long, type, amount)

# Multiple values are stored in one column
# Separate the yr_month column into two; no sep is given, so separate()
# falls back to its default separator
census_long4 <- separate(census_long3, yr_month, into = c("year", "month"))
# ---- preparing.R ----
# Variable Types
#   character: "treatment", "123", "A"
#   numeric:   23.44, 120, NaN, Inf
#   integer:   4L, 1123L
#   factor:    factor("Hello"), factor(8)
#   logical:   TRUE, FALSE, NA
# Conversions
# Each call below reports the class named in its comment

# "character"
class("TRUE")

# "numeric"
class(8484)

# "integer"
class(99L)

# "factor"
class(factor("factor"))

# "logical"
class(FALSE)

# Coerce the Grades column of students to character
students[["Grades"]] <- as.character(students[["Grades"]])

# Coerce the Medu and Fedu columns of students to factor
students[c("Medu", "Fedu")] <- lapply(students[c("Medu", "Fedu")], as.factor)
# Working with dates
# lubridate supplies the dmy(), mdy_hm(), ymd() and ymd_hms() parsers
library(lubridate)

# Day-month-year text parsed as a date
dmy("17 Sep 2015")

# Month-day-year text with hours and minutes (no seconds) parsed as a date-time
mdy_hm("July 15, 2012 12:56")

# Turn the dob column of students2 into a date (no time component)
students2[["dob"]] <- ymd(students2[["dob"]])

# Turn the nurse_visit column of students2 into a date-time
students2[["nurse_visit"]] <- ymd_hms(students2[["nurse_visit"]])
# String manipulation
# stringr supplies str_trim(), str_pad(), str_detect() and str_replace()
library(stringr)

# Trimming and padding strings
# Strip leading and trailing whitespace from each name
str_trim(c(" Filip ", "Nick ", " Jonathan"), side = "both")

# Left-pad each string with zeros up to a width of 9
str_pad(
  c("23485W", "8823453Q", "994Z"),
  width = 9,
  side = "left",
  pad = "0"
)

# Upper and lower case
# Upper-case copy of states, kept as states_upper
states_upper <- toupper(states)

# ... and back to lower case again
tolower(states_upper)

# Finding and replacing strings
# Flag every date of birth (dob) that contains "1997"
str_detect(string = students3$dob, pattern = "1997")

# In the sex column, expand "F" to "Female" ...
students3[["sex"]] <- str_replace(
  students3[["sex"]],
  pattern = "F",
  replacement = "Female"
)

# ... and "M" to "Male"
students3[["sex"]] <- str_replace(
  students3[["sex"]],
  pattern = "M",
  replacement = "Male"
)
# Missing and special values
#   R          - NA
#   SPSS, SAS  - . (a single dot)
#   ?          - "" (an empty string)
#   Inf        - infinite value
# Finding missing values
# Element-wise NA check over the whole of social_df
is.na(social_df)

# Is there at least one NA anywhere in the data?
any(is.na(social_df))

# Summary of the dataset
summary(social_df)

# Frequency table of the status column
table(social_df$status)

# Dealing with missing values
# Turn every empty string in status into NA
social_df$status[which(social_df$status == "")] <- NA

# Print social_df to the console
social_df

# Which rows have no missing values?
complete.cases(social_df)

# Drop every row that has at least one missing value
na.omit(social_df)
# Outliers and obvious errors
# Dealing with outliers and obvious errors
# This section only inspects students3 (summary and plots); nothing is modified.
# Look at a summary() of students3
summary(students3)
# View a histogram of the age variable
hist(students3$age)
# View a histogram of the absences variable
hist(students3$absences)
# View a histogram of absences, but force zeros to be bucketed to the right of zero
# (right = FALSE makes the bins left-closed, so a value of 0 starts the first bin)
hist(students3$absences, right = FALSE)
# View a boxplot of age
boxplot(students3$age)
# View a boxplot of absences
boxplot(students3$absences)
# ---- cleaningData.R ----
### --- SUMMARY ---
## Get a feel for the data
# Class of weather (expected to be a data.frame)
class(weather)

# Dimensions of weather
dim(weather)

# Column names of weather
names(weather)

# Summarize the data
# Structure of weather
str(weather)

# dplyr's glimpse() gives another view of the structure
library(dplyr)
glimpse(weather)

# Summary of weather
summary(weather)

# Take a closer look
# First 6 rows, then first 15 rows
head(weather, n = 6)
head(weather, n = 15)

# Last 6 rows, then last 10 rows
tail(weather, n = 6)
tail(weather, n = 10)
## Column names are values
# tidyr supplies gather() and spread()
library(tidyr)

# Stack columns X1 through X31 into day / value pairs, dropping NA values
weather2 <- gather(weather, key = day, value = value, X1:X31, na.rm = TRUE)

# First rows of the gathered data
head(weather2)

# Values are variable names
# Drop the first column (the column of row names)
without_x <- weather2[-1]

# One column per distinct measure, filled from value
weather3 <- spread(without_x, key = measure, value = value)

# First rows of the spread data
head(weather3)
## Clean up dates
# stringr for str_replace(), lubridate for ymd()
library(stringr)
library(lubridate)

# Strip the "X" from the day column
weather3[["day"]] <- str_replace(
  weather3[["day"]],
  pattern = "X",
  replacement = ""
)

# Combine year, month and day into a single "-"-separated date column
weather4 <- unite(weather3, col = date, year, month, day, sep = "-")

# Parse the date column into a proper date with lubridate's ymd()
weather4[["date"]] <- ymd(weather4[["date"]])

# Reorder columns with dplyr's select(): date, then Events, then the rest
weather5 <- select(weather4, date, Events, CloudCover:WindDirDegrees)

# First rows of weather5
head(weather5)
## A closer look at column types
# View the structure of weather5
str(weather5)
# Examine the first 20 rows of weather5. Are most of the characters numeric?
head(weather5, 20)
# See what happens if we try to convert PrecipitationIn to numeric:
# entries that are not numeric text (such as the "T" handled below) cannot be
# converted and come back as NA with a coercion warning
as.numeric(weather5$PrecipitationIn)

## Column type conversions
# Replace "T" with "0" (T = trace)
weather5$PrecipitationIn <- str_replace(weather5$PrecipitationIn, "T", "0")
# Convert characters to numerics.
# The function is passed to mutate_at() directly: wrapping it in funs() has
# been deprecated since dplyr 0.8.0 and triggers a deprecation warning.
weather6 <- mutate_at(weather5, vars(CloudCover:WindDirDegrees), as.numeric)
# Look at result
str(weather6)
## Find missing values
# Count missing values (total number of NA cells in weather6)
sum(is.na(weather6))
# Find missing values (summary() reports the NA count per column)
summary(weather6)
# Find indices of NAs in Max.Gust.SpeedMPH
ind <- which(is.na(weather6$Max.Gust.SpeedMPH))
# Look at the full rows for records missing Max.Gust.SpeedMPH
weather6[ind, ]
## An obvious error
# Review distributions for all variables
summary(weather6)
# Find row with Max.Humidity of 1000
# (ind is reused from here on; each step overwrites the previous index)
ind <- which(weather6$Max.Humidity == 1000)
# Look at the data for that day
weather6[ind, ]
# Change 1000 to 100
weather6$Max.Humidity[ind] <- 100
## Another obvious error
# Look at summary of Mean.VisibilityMiles
summary(weather6$Mean.VisibilityMiles)
# Get index of row with -1 value
ind <- which(weather6$Mean.VisibilityMiles == -1)
# Look at full row
weather6[ind, ]
# Set Mean.VisibilityMiles to the appropriate value (the -1 entry becomes 10)
weather6$Mean.VisibilityMiles[ind] <- 10
## Check other extreme values
# Review summary of full data once more
summary(weather6)
# Look at histogram for MeanDew.PointF
hist(weather6$MeanDew.PointF)
# Look at histogram for Min.TemperatureF
hist(weather6$Min.TemperatureF)
# Compare to histogram for Mean.TemperatureF
hist(weather6$Mean.TemperatureF)
## Finishing touches
# Replace the column names with new_colnames
# (new_colnames is not defined in this script; it must already exist)
names(weather6) <- new_colnames

# Fill empty cells in the events column with "None"
weather6$events[which(weather6$events == "")] <- "None"

# First 6 rows of the cleaned weather6
head(weather6, n = 6)
options(stringsAsFactors = FALSE)

#################### setup
## Weekday labels (named vectors are covered in the module 3 Friday session)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")

## Define the poker and roulette vectors, labelled by weekday
poker_vector <- setNames(c(140, -50, 20, -120, 240), days_vector)
roulette_vector <- setNames(c(-24, -50, 100, -350, 10), days_vector)

## Which days did we make money on poker?
selection_vector <- poker_vector > 0
selection_vector

## Why is this code different? (your question)
selection_vector <- poker_vector[c(1:5) > 0]

############### explanation 1
# c(1:5) is just a vector; because `:` already builds it, the c() is not needed
c(1:5)
# c(1:5) == 1:5
# identical(c(1:5), 1:5)

# Every element of 1:5 is greater than zero
1:5 > 0

## In short: above you are not using positional indexes, you are comparing
## the vector 1:5 itself to zero.
## The question asks for the VALUES to be evaluated against zero.
# You are simply passing in five TRUEs, which returns the entire poker_vector
poker_vector[1:5 > 0] == poker_vector ## one way to check
identical(poker_vector[1:5 > 0], poker_vector) # are the two objects identical?

# But when we ask whether the vector itself is > 0 ...
poker_vector > 0
# ... we get a mix of TRUEs and FALSEs.
# Those TRUE/FALSE values select only the positive entries (the TRUEs)
selection_vector2 <- poker_vector > 0
# and now return just the positive values
poker_vector[selection_vector2]

################# explanation 2
selection_vector3 <- poker_vector[c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday") > 0]
## Inside the brackets you are doing a logical comparison of strings against zero
c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday") > 0
## Simply put: you are comparing a character vector to zero, not the
## elements that carry those names.

## The same thing shown a different way
dow <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
dow > 0
is.character(dow)
is.vector(dow)
typeof(dow)
## Comparing strings to zero is bizarre, but it is a character comparison,
## basically an alphabetical-order check.
## Because every result is TRUE, the entire vector is returned.

##### summary
## Using c(1:5) does not hurt, but as shown above it is identical to 1:5, so
## the c() is overkill (not a bad thing while you are learning).
## You need to compare the vector's values element-wise.
## If you want to compare selected elements, subset first and compare outside.
poker_vector[c(2, 4, 5)]
poker_vector[c(2, 4, 5)] > 0
## Note: c() is needed here because the index is not a sequence; it is built
## element by element.