r Change data format to data.table

When your dataframe suddenly turns out to be a matrix (or something similar) and dplyr operations no longer work, you can convert the dataset to a data.table.

as.table
# My df dataframe is scaled and centered - the function that produces df returns:

scale(df)

# Printing str(df) shows attributes saying that it is centered and scaled.

# Converting to a data.table solved the issue:

library(data.table)
library(dplyr)
df %>% as.data.table() %>% dplyr::arrange(obs_id, user_id, scroll_id, timestamp)
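
A minimal sketch of the same situation with made-up data (column names here are purely for illustration): scale() returns a matrix, not a data frame, which is why the dplyr verbs fail until it is converted.

toy <- data.frame(obs_id = 1:5, value = c(10, 3, 7, 1, 9))
scaled <- scale(toy)      # returns a matrix with scaled:center / scaled:scale attributes
class(scaled)             # "matrix" "array", not a data.frame
# arrange(scaled, value) would error because scaled is a matrix
scaled %>% as.data.table() %>% arrange(value)   # works again after the conversion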

r Fill NA rows with the previous non-NA value

https://markhneedham.com/blog/2015/06/28/r-dplyr-update-rows-with-earlierprevious-rows-values/

Fill_rows_previous_nonNA

library(zoo)
library(dplyr)

# Entire dataframe
data.frame(col1 = c(1,2,3,4,5), col2  = c("a", NA, NA , "b", NA)) %>% 
    do(na.locf(.))
  col1 col2
1    1    a
2    2    a
3    3    a
4    4    b
5    5    b

# Specific columns
data.frame(col1 = c(1,2,3,4,5), col2  = c("a", NA, NA , "b", NA), col3 = c("A", NA, "B", NA, NA)) %>% 
    mutate(col2 = na.locf(col2))
    
    
  col1 col2 col3
1    1    a    A
2    2    a <NA>
3    3    a    B
4    4    b <NA>
5    5    b <NA>
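
One edge case worth noting (a small sketch): na.locf() drops leading NAs by default, which makes mutate() fail on a column whose first value is NA; na.rm = FALSE keeps them in place.

data.frame(col1 = c(1, 2, 3), col2 = c(NA, "a", NA)) %>%
    mutate(col2 = na.locf(col2, na.rm = FALSE))   # col2 becomes NA, "a", "a"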

r R ggplot2 basics

R
library("tidyverse")
mpg

ggplot(data = mpg) + geom_point(mapping = aes(x=displ,y=hwy))

ggplot(data = mpg)

ggplot(data = mpg) + geom_point(mapping = aes(x=displ,y=hwy,color=class))

ggplot(data = mpg) + geom_point(mapping = aes(x=displ,y=hwy,alpha=class))

ggplot(data = mpg) +
    geom_point(mapping = aes(x=displ,y=hwy))+
    facet_wrap(~ class,nrow=2)

ggplot(data = mpg) +
  geom_point(mapping = aes(x=displ,y=hwy))+
  facet_grid(drv~cyl)

# Exercises
 

ggplot(data = mpg, mapping = aes(x=displ,y=hwy)) +
  geom_point(mapping = aes(color = class))+
  geom_smooth()


ggplot(data = mpg, mapping = aes(x=displ,y=hwy)) +
  geom_point(mapping = aes(color = class))+
  geom_smooth(
    data=filter(mpg,class=="suv"),
    se = FALSE  
  )

# Bar chart

ggplot(data=diamonds)+
  geom_bar(mapping=aes(x=cut))

# stat_count() works too

ggplot(data=diamonds)+
  stat_count(mapping=aes(x=cut))



r R - Sample rows from a dataframe

sample_rows_from_dataframe.r
numRowsToSample <- 1000
samp <- df[sample(nrow(df), numRowsToSample), ]  # sample that many rows, keep all columns
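
An equivalent tidyverse way to do this (assuming dplyr >= 1.0 and the same df), with a seed so the draw is reproducible:

library(dplyr)
set.seed(42)                           # make the sample reproducible
samp <- df %>% slice_sample(n = 1000)  # or prop = 0.1 to sample a fraction of rows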

r Biostrings

r_biostring_basic
library(Biostrings)
s <- readDNAStringSet("nm.fasta","fasta")
subs <- subseq(s["seqA"], start=c(1, 2, 3), end=c(3, 6, 5))

# convert it to string
subschar <- as.character(subs)
               

r Document-term matrix

Document-term matrix (DFM) with quanteda, plus word clouds, a dendrogram, and LDA topic modelling.

DFM.R
library(readr)
library(slam)
library(quanteda)
library(tidyverse)
library(RColorBrewer)

### -------------------- creating the document-term matrix
## remove stopwords, punctuation, and symbols
## TEXTINPUT is the input text (character vector or corpus) for the dfm
dfm <- dfm(TEXTINPUT,
           remove = c(stopwords("english")),
           ngrams=1L,
           stem = F,
           remove_numbers = TRUE, 
           remove_punct = TRUE,
           remove_symbols = TRUE)

vdfm <- dfm_trim(dfm, min_termfreq = 10, min_docfreq = 5)
# min_termfreq = remove words used fewer than x times
# min_docfreq  = remove words used in fewer than x docs

topfeatures(vdfm, n = 50)

#Let’s plot two word clouds: one with the raw term frequencies and one with TF-IDF
textplot_wordcloud(vdfm,  scale=c(6, 2), colors=brewer.pal(8, "Dark2"), 
                   random.order = F, rot.per=0.1, max.words=250, main = "Raw Counts")

textplot_wordcloud(dfm_tfidf(vdfm),  scale=c(3.5, .75), colors=brewer.pal(8, "Dark2"), 
                   random.order = F, rot.per=0.1, max.words=250, main = "TF-IDF")



### -------- creating a dendrogram

numWords <- 50

wordDfm <- dfm_sort(dfm_tfidf(vdfm))  # sort the tf-idf weighted dfm by term frequency
wordDfm <- t(wordDfm)[1:numWords,]    # keep the top numWords words
wordDistMat <- dist(wordDfm)
wordCluster <- hclust(wordDistMat)
plot(wordCluster, xlab="", main="TF-IDF Frequency weighting (First 50 Words)")


###########topic modelling

library(topicmodels)

# we now export to a format that we can run the topic model with
dtm <- convert(vdfm, to="topicmodels")

###########normal approach#########
start.time <- Sys.time()
# estimate LDA with K topics
K <- 20

lda <- LDA(dtm, k = K, method = "Gibbs", 
           control = list(verbose=25L, seed = 123, burnin = 100, iter = 100))
total.time <- Sys.time() - start.time
normal_approach = total.time
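
To peek at the fitted model, topicmodels' terms() and topics() accessors list the top words per topic and the most likely topic per document (the number of terms shown below is arbitrary):

terms(lda, 10)      # top 10 terms for each of the K topics
head(topics(lda))   # most likely topic assignment for the first documents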

r Save plots with ggsave in a for loop

Reprex of ggsave() in a for loop, generating one plot for each level of a categorical variable.

ggsave-example.R
library(tidyverse)

## load data
data("diamonds")

## get character vector to loop over
cuts <-
  diamonds %>%
  select(cut) %>%
  distinct() %>%
  as_vector() %>%
  as.character()

## generate plots
for(i in cuts){
  print(i)
  
  p <- diamonds %>%
    filter(cut == i) %>%
    ggplot(aes(x = carat))+
    geom_histogram(bins = 30)
  
  ggsave(filename = paste0("diamonds-example-",i,".png"),
         plot = p, ## pass the plot explicitly; inside a loop the plot is never printed, so the "last plot" may not be the one you expect
         width = 7, 
         height = 5,
         units = "in")
}
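
The same thing can be written without an explicit for loop using purrr::walk - just a sketch of an equivalent functional style (save_cut_plot is a made-up helper name):

library(purrr)

save_cut_plot <- function(ct){
  p <- diamonds %>%
    filter(cut == ct) %>%
    ggplot(aes(x = carat)) +
    geom_histogram(bins = 30)
  ggsave(paste0("diamonds-example-", ct, ".png"), plot = p,
         width = 7, height = 5, units = "in")
}

walk(cuts, save_cut_plot)   # one file per cut, same as the loop above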

r Simulate adonis

adonis.r
nsub <- 30000   # number of subjects

# simulated covariates: two principal components and a binary study type
dummy_datamatrix <- data.frame(pc1 = rnorm(nsub), pc2 = rnorm(nsub), studytype = round(runif(nsub)))

# dummy "distance" matrix, only used here to gauge memory usage
distmat <- matrix(rnorm(nsub^2), ncol = nsub, nrow = nsub)


library(vegan)

adon1 <- adonis(distmat ~ pc1 + pc2 + studytype, data = dummy_datamatrix, parallel = 8)

10,000 people needs about 3.3 GB of RAM and 30,000 people about 30 GB, so RAM scales roughly quadratically with the number of people: (30k/10k)^2 * 3.3 GB ≈ 30 GB.

R reported that it couldn't allocate a matrix of this size, even though the machine has 32 GB of RAM - either RAM is already in use by other processes or R hits a limit somewhere.

Could a GRM based on the overlap be used instead?

In practice this seems to top out at around 30k people.
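
A quick back-of-envelope check of that scaling (assuming a dense double-precision matrix at 8 bytes per cell; adonis makes additional copies on top of this):

(10000^2 * 8) / 1024^3   # ~0.75 GB for the 10k x 10k distance matrix alone
(30000^2 * 8) / 1024^3   # ~6.7 GB for 30k x 30k - a 9x jump, i.e. quadratic in the number of people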

r Rescale a vector to a new range of values

Rescale a vector to a new range of values.

dataTranformation.R

## Maps the sorted unique values of `vector` onto an evenly spaced 0-1 scale via a
## linear fit, then snaps the minimum and maximum of the predictions to new.range.
## Note: only the extremes are set to new.range; intermediate values stay on the
## fitted 0-1 scale.
dataTranformation <- function(vector, new.range) {
  
  # pair each unique input value with an evenly spaced value between 0 and 1
  exp.lm <- data.frame(input = sort(unique(vector)),
                       outup = seq(0, 1, length.out = length(unique(vector))))
  
  # linear fit of the 0-1 scale on the input values
  fit <- lm(outup ~ input, data = exp.lm)
  
  # predict for the full (unsorted) vector
  new.data <- data.frame(input = vector)
  new.data <- unlist(predict.lm(fit, new.data))
  new.data <- as.vector(new.data)
  
  # force the extremes onto the requested range
  new.data[which(new.data == min(new.data))] <- new.range[1]
  new.data[which(new.data == max(new.data))] <- new.range[2]
  
  return(new.data)
}
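
If a plain linear rescale of every value into the target range is what's needed, a simpler sketch (rescale_range is a made-up name; scales::rescale(x, to = new.range) does the same job):

rescale_range <- function(x, new.range) {
  # assumes max(x) > min(x)
  new.range[1] + (x - min(x)) / (max(x) - min(x)) * (new.range[2] - new.range[1])
}

rescale_range(c(5, 10, 20), c(0, 100))   # 0.0  33.3  100.0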

r Remove legend from a ggplot

remove_legend_ggplot.R
+ theme(legend.position="none")
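
A minimal usage sketch (mpg is just an example dataset with a colour legend):

library(ggplot2)

ggplot(mpg, aes(displ, hwy, color = class)) +
  geom_point() +
  theme(legend.position = "none")   # legend removed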