通过将R中的var与NA对照分组来分隔某些观察结果之前选择组 [英] select group before certain observations separated by grouping var in R with NA control

查看:73
本文介绍了通过将R中的var与NA对照分组来分隔某些观察结果之前选择组的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我的样品。

  data = structure(list(add = structure(c(1L,1L,1L,1L,1L,1L,1L, 
1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,
1L,1L,1L,2L,2L, 2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,
2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L, 2L),.Label = c( x,
y),class = factor),x1 = c(14L,15L,36L,0L,0L,0L,53L,
10L,39L,27L,67L,25L,19L,49L,53L,64L,61L,12L,75L,34L,
88L,43L,85L,93L,44L,31L,37L,90L,66L,39L, 59L,96L,41L,
23L,20L,26L,69L,28L,35L,96L,87L,82L,70L,68L,26L,12L,
58L,18L,76L,93L,3L, 31L),组=结构(c(2L,2L,2L,
2L,2L,2L,2L,2L,2L,2L,2L,1L,1L,1L,1L,1L,1L,1L,2L ,
2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,
2L,2L,1L,1L,1L ,1L,1L,1L,1L,2L,2L,2L,2L,2L,2L,2L,
2L),.Label = c( female, male),class = factor) )。.names = c( add,
x1, group),class = data.frame,row.na mes = c(NA,-52L))

在此数据中存在组变量(性别(男性和女性)我需要获得统计平均值,以及所有女性之前的所有男性的统计平均值和25%。男的,女的,我不动。我也是女性,我不碰。
,这是根据添加列中的组 x y 进行的分析。
如果对于男性而言,其价值比女性值高出x1> 25%(我们针对男性在女性之前进行了计算),则该值必须由女性之前的男性均值代替。 >

AntoniosK的解决方案非常好

 图书馆(tidyverse)
库(data.table)

数据%&%;%
group_by(add)%&%;%#每次添加都会执行以下操作...
mutate(group2 = rleid(group))%&%;%
group_by(add,group,group2)%&%;%
mutate(MEAN =平均值(x1 [group == male& group2 == 1] ),
Q25 =分位数(x1 [group == male& group2 == 1],0.25))%&%;%
group_by(add)%&%;%# x1值....
mutate(x1 = ifelse(group == male& group2 == 3& x1> unique(Q25 [!is.na(Q25)])),unique(MEAN [!is.na(MEAN)]),x1))%&%;%
ungroup()%&%;%
select(-group2 )%>%
data.frame()

但现在我要替换0值由x1到Na。

  data $ x1 [data $ x1 == 0]<-NA 

之后,当我取消脚本编写时,出现了错误


mutate_impl(.data,点)中的错误:评估错误:缺少
值,并且如果'na.rm'为FALSE,则不允许使用NaN。


该脚本如何通过NA,并且只能使用int值?



edit



  data = structure(list(add = c(11202L,11202L,11202L,11202L,11202L,
11202L, 11202L,11202L,11202L,11202L,11202L,11202L,11202L,
11202L,11202L,11202L,11202L,11202L,11202L,11202L,11202L,
11202L,11202L,11202L,11202L,11202L,11202L, 11202L,11202L,
11202L,11202L,11202L,11202L,11202L,11202L,1120 2L,11202L,
11202L,11202L,11202L,11202L,11202L,11202L,11202L,11202L,
11202L,11202L,11202L,11202L,11202L,11202L,11202L),x1 = c(NA,
2L,NA,NA,NA,NA,NA,NA,NA,NA,NA,1L,NA,1L,1L,NA,NA,NA,
NA,NA,NA,NA,NA,NA ,NA,NA,NA,NA,1L,NA,NA,NA,NA,NA,
NA,NA,NA,NA,NA,NA,NA,NA,3L,NA,NA,NA,NA,1L ,1L,NA,NA,
NA,NA,NA),group = structure(c(2L,2L,2L,2L,2L,2L,2L,
2L,2L,2L,2L, 2L,2L,2L,2L,2L,2L,2L,2L, 2L,2L,2L,2L,
2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,1L,1L,2L,2L,
2L, 2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L),. Label = c( female,
male),class = factor)) ,.names = c( add, x1, group),class = data.frame,row.names = c(NA,
-52L))

库(tidyverse)
库(data.table)

数据%&%;%
group_by(add)%>%
mutate(group2 = rleid(group))%>%
group_by(add,group,group2)%&%;%
mutate(MEAN = mean(x1 [group == male& group2 == 1]),
Q25 =分位数(x1 [group == male& group2 == 1],0.25))%&%;%
group_by(add)%&%;%
mutate(x1 = ifelse(group == male& group2 == 3& x1> unique(Q25 [!is.na(Q25)])),unique(MEAN [!is.na( MEAN)]),x1),
x1 = ifelse(x1 == 0,NA,x1))%&%;%#添加了新代码
ungroup()%&%;%
select (-group2)%&%;%
data.frame()



Edit2



代码结果

 添加x1组MEAN Q25 
x 14.00000男性23.72727 5.0
x 15.00000男性23.72727 5.0
x 36.00000男性23.72727 5.0
x 0.00000男性23.72727 5.0
x 0.00000男性23.72727 5.0
x 0.00000男性23.72727 5.0
x 53.00000男性23.72727 5.0
x 10.00000男性23.72727 5.0
x 39.00000男性23.72727 5.0
x 27.00000男性2 3.72727 5.0
x 67.00000男性23.72727 5.0
x 25.00000女性NaN NA
x 19.00000女性NaN NA
x 49.00000女性NaN NA
x 53.00000女性NaN NA
x 64.00000女性NaN NA
x 61.00000女NaN NA
x 12.00000女NaN NA
x 23.72727男NaN NA
x 23.72727男NaN NA
x 23.72727男NaN NA
x 23.72727男NaN NA
x 23.72727男性NaN NA
x 23.72727男性NaN NA
x 23.72727男性NaN NA
x 23.72727男性NaN NA

之后

 添加x1组
x 94.90男性

女性之后的前四个男性的和= 94.90

解决方案

我添加了一段代码,可以解决您的问题并简要说明错误。



更新后的代码

  data = stru cture(list(add = structure(c(1L,1L,1L,1L,1L,1L,1L,
1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L, 1L,1L,1L,1L,1L,
1L,1L,1L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,
2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L),. Label = c( x,
y),类别= factor ),x1 = c(14L,15L,36L,0L,0L,0L,53L,
10L,39L,27L,67L,25L,19L,49L,53L,64L,61L,12L,75L,34L,
88L,43L,85L,93L,44L,31L,37L,90L,66L,39L,59L,96L,41L,
23L,20L,26L,69L,28L,35L,96L,87L, 82L,70L,68L,26L,12L,
58L,18L,76L,93L,3L,31L),组=结构(c(2L,2L,2L,
2L,2L,2L,2L ,2L,2L,2L,2L,1L,1L,1L,1L,1L,1L,1L,2L,
2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L ,2L,2L,2L,2L,2L,
2L,2L,1L,1L,1L,1L,1L,1L,1L,2L,2L,2L,2L,2L,2L,2L,2L,
b 2L),.Label = c( female, male),class = factor)),.Names = c( add,
x1, group),class = data.frame,row.names = c(NA,-52L))

库( tidyverse)
库(data.table)

数据%&%;%
group_by(add)%>%
mutate(group2 = rleid(group)) %>%
group_by(add,group,group2)%&%;%
mutate(MEAN =平均值(x1 [group == male& group2 == 1]),
Q25 =分位数(x1 [group == male& group2 == 1],0.25))%&%;%
group_by(add)%&%;%
mutate(x1 = ifelse(group == male& group2 == 3& x1> unique(Q25 [!is.na(Q25)])),unique(MEAN [!is.na( MEAN)]),x1),
x1 = ifelse(x1 == 0,NA,x1))%&%;%#添加了新代码
ungroup()%&%;%
select (-group2)%&%;%
data.frame()

错误说明



您必须运行代码的上一部分,最后只需更新 x1 列。之所以会出现此错误,是因为 NA 值打破了平均值分位数您需要做的计算。



另一种方法是在开始时更新 x1 ,然后使用 na.rm = T 进行计算。



对于您提到的新案例,从 NA 值开始的地方, x1 尝试以下操作:

  data%> %% b $ b group_by(add)%>%
mutate(group2 = rleid(group))%>%
group_by (add,group,group2)%&%;%
mutate(MEAN =平均值(x1 [group == male& group2 == 1],na.rm = T),##此处有额外的代码# #
Q25 =分位数(x1 [group == male& group2 == 1],0.25,na.rm = T))%>%##此处的额外代码##
group_by (add)%&%;%
mutate(x1 = ifelse(group == male& group2 == 3& x1> unique(Q25 [!is.na(Q25)])) ,unique(MEAN [!is.na(MEAN)]),x1))%>%
ungroup()%>%
select(-group2)%>%
data.frame()

对于新案例(编辑2),您提到,首先将先前代码的输出另存为 data2

  data2 =数据%>%... 

然后运行:

  data2%&%;%
group_by(add)%&%;%#每个增值
mutate(group2 = rleid( group))%>%#创建了group2
filter(group == male& group2 == 3)%&%;%#只在女性之后保留男性
summarise(SUM = sum(x1 [row_number()< = 4]))#获取前4行
的x1之和
##小动作:2 x 2
#加总和
#< fct> < dbl>
#1 x 94.9
#2 y 107.


My sample.

 data=structure(list(add = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("x", 
"y"), class = "factor"), x1 = c(14L, 15L, 36L, 0L, 0L, 0L, 53L, 
10L, 39L, 27L, 67L, 25L, 19L, 49L, 53L, 64L, 61L, 12L, 75L, 34L, 
88L, 43L, 85L, 93L, 44L, 31L, 37L, 90L, 66L, 39L, 59L, 96L, 41L, 
23L, 20L, 26L, 69L, 28L, 35L, 96L, 87L, 82L, 70L, 68L, 26L, 12L, 
58L, 18L, 76L, 93L, 3L, 31L), group = structure(c(2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L), .Label = c("female", "male"), class = "factor")), .Names = c("add", 
"x1", "group"), class = "data.frame", row.names = c(NA, -52L))

In this data there is group variable (sex (male and female) I need get statistics mean and 25 percentile for ALL male which go before female. Male which after female, i don't touch. Also female i don't touch. this is analysis split by group x and y from add column. if for male that go after female value by x1 > than 25 percentile, which we calculated for male before female, then this value must be replaced by mean for male before female " Female category we dont' touch.

The solutuion of AntoniosK is very good

library(tidyverse)
library(data.table)

data %>%  
  group_by(add) %>%                                           # for each add do the below...
  mutate(group2 = rleid(group)) %>% 
  group_by(add, group, group2) %>%
  mutate(MEAN = mean(x1[group=="male" & group2==1]),               
         Q25 = quantile(x1[group=="male" & group2==1], 0.25)) %>%
  group_by(add) %>%                                            # for each add update x1 values....
  mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1)) %>%
  ungroup() %>%
  select(-group2) %>%
  data.frame()

but now i want replace 0 value by x1 to Na.

data$x1[data$x1 == 0] <- NA

after it, when i un script, i get the error

Error in mutate_impl(.data, dots) : Evaluation error: missing values and NaN's not allowed if 'na.rm' is FALSE.

How to do, that script passed NA and work only with int value?

edit

data=structure(list(add = c(11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L), x1 = c(NA, 
                                                                                       2L, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, 1L, 1L, NA, NA, NA, 
                                                                                       NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, 
                                                                                       NA, NA, NA, NA, NA, NA, NA, 3L, NA, NA, NA, NA, 1L, 1L, NA, NA, 
                                                                                       NA, NA, NA), group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 
                                                                                                                        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
                                                                                                                        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 
                                                                                                                        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("female", 
                                                                                                                                                                                        "male"), class = "factor")), .Names = c("add", "x1", "group"), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                                                                                                                           -52L))

library(tidyverse)
library(data.table)

data %>%  
  group_by(add) %>%                                          
  mutate(group2 = rleid(group)) %>% 
  group_by(add, group, group2) %>%
  mutate(MEAN = mean(x1[group=="male" & group2==1]),               
         Q25 = quantile(x1[group=="male" & group2==1], 0.25)) %>%
  group_by(add) %>%                                           
  mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1),
         x1 = ifelse(x1==0, NA, x1)) %>%  # new code added
  ungroup() %>%
  select(-group2) %>%
  data.frame()

Edit2

result of code

add x1  group   MEAN    Q25
x   14.00000    male    23.72727    5.0
x   15.00000    male    23.72727    5.0
x   36.00000    male    23.72727    5.0
x   0.00000 male    23.72727    5.0
x   0.00000 male    23.72727    5.0
x   0.00000 male    23.72727    5.0
x   53.00000    male    23.72727    5.0
x   10.00000    male    23.72727    5.0
x   39.00000    male    23.72727    5.0
x   27.00000    male    23.72727    5.0
x   67.00000    male    23.72727    5.0
x   25.00000    female  NaN NA
x   19.00000    female  NaN NA
x   49.00000    female  NaN NA
x   53.00000    female  NaN NA
x   64.00000    female  NaN NA
x   61.00000    female  NaN NA
x   12.00000    female  NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA

After

add x1     group
x   94.90   male

sum of first 4 male after female=94.90

解决方案

I've added a piece of code that would solve your issue and a brief explanation of the error.

Updated code

data=structure(list(add = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("x", 
"y"), class = "factor"), x1 = c(14L, 15L, 36L, 0L, 0L, 0L, 53L, 
10L, 39L, 27L, 67L, 25L, 19L, 49L, 53L, 64L, 61L, 12L, 75L, 34L, 
88L, 43L, 85L, 93L, 44L, 31L, 37L, 90L, 66L, 39L, 59L, 96L, 41L, 
23L, 20L, 26L, 69L, 28L, 35L, 96L, 87L, 82L, 70L, 68L, 26L, 12L, 
58L, 18L, 76L, 93L, 3L, 31L), group = structure(c(2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L), .Label = c("female", "male"), class = "factor")), .Names = c("add", 
"x1", "group"), class = "data.frame", row.names = c(NA, -52L))

library(tidyverse)
library(data.table)

data %>%  
  group_by(add) %>%                                          
  mutate(group2 = rleid(group)) %>% 
  group_by(add, group, group2) %>%
  mutate(MEAN = mean(x1[group=="male" & group2==1]),               
         Q25 = quantile(x1[group=="male" & group2==1], 0.25)) %>%
  group_by(add) %>%                                           
  mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1),
         x1 = ifelse(x1==0, NA, x1)) %>%  # new code added
  ungroup() %>%
  select(-group2) %>%
  data.frame()

Error explanation

You have to run the previous part of the code and in the end you just update the x1 column. You get that error because NA values break the mean and quantile calculations you need to do.

An alternative way would be to update x1 in the beginning and then use na.rm=T for your calculations.

For the new case you mentioned, where you start with NA values for x1 try this:

data %>%  
  group_by(add) %>%                                          
  mutate(group2 = rleid(group)) %>% 
  group_by(add, group, group2) %>%
  mutate(MEAN = mean(x1[group=="male" & group2==1], na.rm = T),      ## extra code here ##    
         Q25 = quantile(x1[group=="male" & group2==1], 0.25, na.rm = T)) %>%  ## extra code here ##
  group_by(add) %>%                                           
  mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1))%>%
  ungroup() %>%
  select(-group2) %>%
  data.frame()

For the new case (edit 2) you mentioned, first save the output of the previous code as data2:

data2 = data %>% ...

And then run this:

data2 %>%
  group_by(add) %>%                           # for each add value                      
  mutate(group2 = rleid(group)) %>%           # created group2
  filter(group=="male" & group2==3) %>%       # keep only male after female
  summarise(SUM = sum(x1[row_number() <= 4])) # get sum of x1 for first 4 rows

# # A tibble: 2 x 2
#   add     SUM
#   <fct> <dbl>
# 1 x      94.9
# 2 y     107.

这篇关于通过将R中的var与NA对照分组来分隔某些观察结果之前选择组的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆