总和并根据第二列有条件地计数 [英] sum and conditionally count based on a second column

查看:125
本文介绍了总和并根据第二列有条件地计数的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

为了解决这个看似简单的问题,我已经感到十分沮丧。我有一个这样的数据集(df):

  structure(list(Year = c(2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L), Unknown = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), Temp = c(21L, 21L, 21L, 23L, 23L, 21L, 21L, 22L, 21L, 23L,
23L, 22L, 21L, 21L, 22L, 22L, 21L, 21L, 23L, 23L), Obs = structure(c(1L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L), .Label = c("mdk", "sde"), class = "factor"), State = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "ma", class = "factor"), Zone = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), Segment = c(8L, 7L, 4L, 17L, 18L, 7L, 2L, 12L, 1L, 17L,
18L, 12L, 9L, 7L, 13L, 11L, 8L, 9L, 17L, 18L), Subseg = c(1L,
3L, 3L, 2L, 2L, 2L, 4L, 0L, 10L, 4L, 2L, 0L, 1L, 1L, 3L, 1L,
2L, 2L, 1L, 1L), Wdir = structure(c(2L, 2L, 1L, 3L, 3L, 2L, 2L,
1L, 2L, 3L, 3L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L), .Label = c("na",
"ne", "nw"), class = "factor"), Wvel = structure(c(1L, 1L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L), .Label = c("5", "na"), class = "factor"), Clouds = structure(c(1L,
1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 3L, 3L), .Label = c("1", "4", "na"), class = "factor"), Temp.1 = structure(c(1L,
1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 3L, 3L), .Label = c("20", "25", "na"), class = "factor"),
Species = structure(c(7L, 21L, 1L, 21L, 16L, 4L, 16L, 6L,
1L, 17L, 5L, 7L, 5L, 1L, 1L, 6L, 7L, 7L, 24L, 5L), .Label = c("ABDU",
"ABDU", "ABDU", "ABDU", "ABDU", "CAGO", "CAGO", "CAGO", "CAGO",
"CAGO", "GOLD", "GOLD", "GOLD", "GOLD", "GOLD", "MERG", "MERG",
"MERG", "MERG", "MERG", "SCOT", "SCOT", "SCOT", "SCOT",
"SCOT", "SCOT", "SCOT"), class = "factor"), Count = c(5L,
1L, 150L, 3L, 20L, 8L, 5L, 10L, 5L, 1L, 20L, 10L, 2L, 2L,
80L, 40L, 1L, 1000L, 2L, 20L)), .Names = c("Year", "Unknown",
"Temp", "Obs", "State", "Zone", "Segment", "Subseg", "Wdir",
"Wvel", "Clouds", "Temp.1", "Species", "Count"), row.names = c(666L,
614L, 2060L, 1738L, 1459L, 536L, 197L, 2467L, 98L, 1794L, 1449L,
2464L, 696L, 483L, 2644L, 2350L, 686L, 844L, 2989L, 2934L), class = "data.frame")

其表头(前几行)如下所示:

 Year Unknown Temp Obs State Zone Segment Subseg Wdir Wvel
666 2015 1 21 mdk ma 2 8 1 ne 5
614 2015 1 21 mdk ma 2 7 3 ne 5
2060 2015 1 21 sde ma 2 4 3 na na
1738 2015 1 23 mdk ma 2 17 2 nw 5
1459 2015 1 23 mdk ma 2 18 2 nw 5
536 2015 1 21 mdk ma 2 7 2 ne 5
 Clouds Temp.1 Species Count
666 1 20 CAGO 5
614 1 20 SCOT 1
2060 na na ABDU 150
1738 1 20 SCOT 3
1459 1 20 MERG 20
536 1 20 ABDU 8

在 dplyr 中,除了其他一些操作之外,我还想在按 Segment 分组时,把每个物种(Species)的 Count 总和作为新的一列。下面是我尝试了许多变体之后的最终代码。

  df_group = df %>%
    group_by(Segment) %>%
    summarise(temp = round(mean(Temp)),
              WDir = round(mean(Wdir)),
              ABDU = sum(which(Species=="ABDU"),Count),
              CAGO = sum(which(Species=="CAGO"),Count),
              GOLD = sum(which(Species=="GOLD"),Count),
              MERG = sum(which(Species=="MERG"),Count),
              SCOT = sum(which(Species=="SCOT"),Count))

这是我得到的(显示正确的格式):

 Segment temp WDir ABDU CAGO GOLD MERG SCOT
1 1 21 2 6 5 5 5 5
2 2 21 2 5 5 5 6 5
3 4 21 1 151 150 150 150 150
4 7 21 2 16 11 11 11 12
5 8 21 2 6 9 6 6 6
6 9 21 2 1003 1004 1002 1002 1002

格式和总体思路正是我想要的,但这些数字并没有按我期望的方式相加。我确定这很简单,但需要一些帮助!谢谢。

解决方案

问题在于 which 返回的是一个位置(下标)向量,而你并没有用这些位置去取子集。所以你得到的总和,是条件为真的那些位置的下标再加上 Count 变量的全部取值。例如:

  x <- c("a", "b", "b")
count <- c(10, 11, 12)
sum(which(c("a", "b", "b") == "b"), count)
# 38,因为它等于 2 + 3 + 10 + 11 + 12

我相信你想要的是下面这样(或者说这至少是其中一种写法):

  sum(ifelse(x == "b", count, 0))
# 23,因为它等于 0 + 11 + 12

翻译成dplyr语法,你的例子可能看起来像这样:

  df_group = df %>%
    group_by(Segment) %>%
    summarise(temp = round(mean(Temp)),
              WDir = round(mean(Wdir)),
              ABDU = sum(ifelse(Species=="ABDU", Count, 0L)),
              CAGO = sum(ifelse(Species=="CAGO", Count, 0L)),
              GOLD = sum(ifelse(Species=="GOLD", Count, 0L)),
              MERG = sum(ifelse(Species=="MERG", Count, 0L)),
              SCOT = sum(ifelse(Species=="SCOT", Count, 0L)))


I have gotten frustrated trying to solve this seemingly simple problem. I have a dataset (df) like this:

structure(list(Year = c(2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 
2015L, 2015L, 2015L, 2015L, 2015L), Unknown = c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), Temp = c(21L, 21L, 21L, 23L, 23L, 21L, 21L, 22L, 21L, 23L, 
23L, 22L, 21L, 21L, 22L, 22L, 21L, 21L, 23L, 23L), Obs = structure(c(1L, 
1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 
1L, 2L, 2L), .Label = c("mdk", "sde"), class = "factor"), State = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = "ma", class = "factor"), Zone = c(2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L), Segment = c(8L, 7L, 4L, 17L, 18L, 7L, 2L, 12L, 1L, 17L, 
18L, 12L, 9L, 7L, 13L, 11L, 8L, 9L, 17L, 18L), Subseg = c(1L, 
3L, 3L, 2L, 2L, 2L, 4L, 0L, 10L, 4L, 2L, 0L, 1L, 1L, 3L, 1L, 
2L, 2L, 1L, 1L), Wdir = structure(c(2L, 2L, 1L, 3L, 3L, 2L, 2L, 
1L, 2L, 3L, 3L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L), .Label = c("na", 
"ne", "nw"), class = "factor"), Wvel = structure(c(1L, 1L, 2L, 
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 
2L), .Label = c("5", "na"), class = "factor"), Clouds = structure(c(1L, 
1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L, 
1L, 3L, 3L), .Label = c("1", "4", "na"), class = "factor"), Temp.1 = structure(c(1L, 
1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L, 
1L, 3L, 3L), .Label = c("20", "25", "na"), class = "factor"), 
Species = structure(c(7L, 21L, 1L, 21L, 16L, 4L, 16L, 6L, 
1L, 17L, 5L, 7L, 5L, 1L, 1L, 6L, 7L, 7L, 24L, 5L), .Label = c("ABDU", 
                                                                                                      "ABDU", "ABDU", "ABDU", "ABDU", "CAGO", "CAGO", "CAGO", "CAGO", 
                                                                                                      "CAGO", "GOLD", "GOLD", "GOLD", "GOLD", "GOLD", "MERG", "MERG", 
                                                                                                      "MERG", "MERG", "MERG", "SCOT", "SCOT", "SCOT", "SCOT", 
                                                                                                      "SCOT", "SCOT", "SCOT"), class = "factor"), Count = c(5L, 
1L, 150L, 3L, 20L, 8L, 5L, 10L, 5L, 1L, 20L, 10L, 2L, 2L, 
80L, 40L, 1L, 1000L, 2L, 20L)), .Names = c("Year", "Unknown", 
"Temp", "Obs", "State", "Zone", "Segment", "Subseg", "Wdir", 
"Wvel", "Clouds", "Temp.1", "Species", "Count"), row.names = c(666L, 
614L, 2060L, 1738L, 1459L, 536L, 197L, 2467L, 98L, 1794L, 1449L, 
2464L, 696L, 483L, 2644L, 2350L, 686L, 844L, 2989L, 2934L), class = "data.frame")

With a header that looks like this:

 Year Unknown Temp Obs State Zone Segment Subseg Wdir Wvel
666  2015       1   21 mdk    ma    2       8      1   ne    5
614  2015       1   21 mdk    ma    2       7      3   ne    5
2060 2015       1   21 sde    ma    2       4      3   na   na
1738 2015       1   23 mdk    ma    2      17      2   nw    5
1459 2015       1   23 mdk    ma    2      18      2   nw    5
536  2015       1   21 mdk    ma    2       7      2   ne    5
 Clouds Temp.1 Species Count
666       1     20    CAGO     5
614       1     20    SCOT     1
2060     na     na    ABDU   150
1738      1     20    SCOT     3
1459      1     20    MERG    20
536       1     20    ABDU     8

Among other things within dplyr, I want to get a sum of each species as a new column, when I am grouping by segment. This is the final code I have tried with many variations.

df_group = df %>%
  group_by(Segment) %>%
  summarise(temp = round(mean(Temp)),   
            WDir = round(mean(Wdir)),
            ABDU = sum(which(Species=="ABDU"),Count),
            CAGO = sum(which(Species=="CAGO"),Count),
            GOLD = sum(which(Species=="GOLD"),Count),
            MERG = sum(which(Species=="MERG"),Count),
            SCOT = sum(which(Species=="SCOT"),Count))

And this is what I get (to show correct format):

Segment temp WDir ABDU CAGO GOLD MERG SCOT
1       1   21    2    6    5    5    5    5
2       2   21    2    5    5    5    6    5
3       4   21    1  151  150  150  150  150
4       7   21    2   16   11   11   11   12
5       8   21    2    6    9    6    6    6
6       9   21    2 1003 1004 1002 1002 1002

The format and general idea are what I want, but the numbers are not adding up the way I want them to. I'm sure it is simple but need some help! Thanks.

解决方案

The problem is that `which()` returns a vector of positions, but you're not using those positions to subset anything. So the sum you are getting is the sum of the positions where the condition is true, plus every value of the Count variable. For example:

x <- c("a", "b", "b")
count <- c(10, 11, 12)
sum(which(c("a", "b", "b") == "b"), count)
# 38 because it is 2 + 3 + 10 + 11 + 12

I believe what you want is (or at least one way of writing it):

sum(ifelse(x == "b", count, 0))
# 23 because it is equal to 0 + 11 + 12

Translating into dplyr syntax, your example could look like this:

df_group = df %>%
  group_by(Segment) %>%
  summarise(temp = round(mean(Temp)),   
            WDir = round(mean(Wdir)),
            ABDU = sum(ifelse(Species=="ABDU", Count, 0L)),
            CAGO = sum(ifelse(Species=="CAGO", Count, 0L)),
            GOLD = sum(ifelse(Species=="GOLD", Count, 0L)),
            MERG = sum(ifelse(Species=="MERG", Count, 0L)),
            SCOT = sum(ifelse(Species=="SCOT", Count, 0L)))

这篇关于总和并根据第二列有条件地计数的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆