组合频率直方图使用两个属性 [英] Combined frequency histogram using two attributes

查看:229
本文介绍了组合频率直方图使用两个属性的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我使用ggplot2为两个不同的参数创建直方图。我目前的方法附在问题的末尾(包括一个可以直接从pastebin.com加载使用的数据集),它会创建:


  1. 一个直方图,基于“location”属性(取值为“WITHIN”或“NOT_WITHIN”)可视化已记录用户数据的空间分布频率。
  2. 一个直方图,基于“context”属性(取值为“Clicked A”或“Clicked B”)可视化已记录用户数据的分布频率。



这看起来像下面这样:

 # Load the example dataset from pastebin (semicolon-separated CSV).
RawDataSet <- read.csv("http://pastebin.com/raw/uKybDy03", sep = ";")

# Load packages. dplyr is attached after plyr, so names exported by both
# (e.g. summarize, mutate) resolve to the dplyr versions unless qualified.
library(plyr)
library(dplyr)
library(reshape2)
library(ggplot2)

###### Create Frequency Table for Location-Information
# One row per UserEmail with the total number of records and the number of
# records whose location is "WITHIN" / "NOT_WITHIN".
# plyr::summarize is named explicitly because the bare name is masked by dplyr.
LocationFrequency <- ddply(
  RawDataSet, .(UserEmail), plyr::summarize,
  All = length(UserEmail),
  Within_area = sum(location == "WITHIN"),
  Not_within_area = sum(location == "NOT_WITHIN")
)

# Add a numeric identifier (taken from the row names) as the first column.
# It has to be numeric because it is plotted on a continuous x axis below.
LocationFrequency <- cbind(
  id = as.numeric(rownames(LocationFrequency)),
  LocationFrequency
)

# Melt to long format: the two count columns become variable/value pairs.
LocationFrequency.m <- melt(
  LocationFrequency,
  id.vars = c("UserEmail", "All", "id")
)

# Stacked bars: one bar per user id, filled by location type.
p <- ggplot(LocationFrequency.m, aes(x = id, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  theme_grey(base_size = 16) +
  labs(title = "Histogram showing the distribution of all spatial information per user.") +
  labs(x = "User", y = "Number of notifications interaction within/not within the area") +
  # Use the ids instead of UserEmail: one tick per user, derived from the
  # data rather than from a hard-coded 1..30 list.
  scale_x_continuous(breaks = seq_len(nrow(LocationFrequency)))

# Change the legend title and draw the plot.
p + labs(fill = "Type of location")



##### Create Frequency Table for Interaction-Information
# One row per UserEmail with the total number of records and the number of
# records whose context is "Clicked A" / "Clicked B".
# plyr::summarize is named explicitly because dplyr (attached later) masks
# the bare name. The object name InterationFrequency is kept as-is (sic).
InterationFrequency <- ddply(
  RawDataSet, .(UserEmail), plyr::summarize,
  All = length(UserEmail),
  Clicked_A = sum(context == "Clicked A"),
  Clicked_B = sum(context == "Clicked B")
)

# Add a numeric identifier (taken from the row names) as the first column.
# It has to be numeric because it is plotted on a continuous x axis below.
InterationFrequency <- cbind(
  id = as.numeric(rownames(InterationFrequency)),
  InterationFrequency
)

# Melt to long format: the two count columns become variable/value pairs.
InterationFrequency.m <- melt(
  InterationFrequency,
  id.vars = c("UserEmail", "All", "id")
)

# Stacked bars: one bar per user id, filled by interaction type.
p <- ggplot(InterationFrequency.m, aes(x = id, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  theme_grey(base_size = 16) +
  labs(title = "Histogram showing the distribution of all interaction types per user.") +
  labs(x = "User", y = "Number of interaction") +
  # Use the ids instead of UserEmail: one tick per user, derived from the
  # data rather than from a hard-coded 1..30 list.
  scale_x_continuous(breaks = seq_len(nrow(InterationFrequency)))

# Change the legend title and draw the plot.
p + labs(fill = "Type of interaction")






我想要实现的是:怎样才能把两个直方图合并到同一张图中?是否可以为每个部分标注相应的百分比?类似下面的草图:它表示每个用户的观察总数(条形的整个高度),并用不同的分段来可视化相应的数据。每个条形将被分成两部分(within 和 not_within),其中每个部分又被分成两个子部分,显示交互类型(“Clicked A”或“Clicked B”)的百分比。



解决方案

根据更新后的描述,我会创建一个由两部分组成的组合条形图:负值部分和正值部分。为了实现这一点,您必须先将数据转换为正确的格式:

 # Load needed libraries.
library(dplyr)
library(tidyr)
library(ggplot2)

# Count interactions per user, location and context, then derive:
#   n2 - the count, negated for "NOT_WITHIN" rows
#   p  - n2 divided by the total count of its (UserEmail, location) group
new.df <- RawDataSet %>%
  group_by(UserEmail, location, context) %>%
  tally() %>%
  mutate(n2 = n * ifelse(location == "NOT_WITHIN", -1, 1)) %>%
  group_by(UserEmail, location) %>%
  mutate(p = n2 / sum(n))

new.df dataframe看起来像:

> new.df
Source: local data frame [90 x 6]
Groups: UserEmail, location [54]

   UserEmail   location   context     n    n2          p
      (fctr)     (fctr)    (fctr) (int) (dbl)      (dbl)
1      andre NOT_WITHIN Clicked A     3    -3 -1.0000000
2       bibi NOT_WITHIN Clicked A     4    -4 -0.5000000
3       bibi NOT_WITHIN Clicked B     4    -4 -0.5000000
4       bibi     WITHIN Clicked A     9     9  0.6000000
5       bibi     WITHIN Clicked B     6     6  0.4000000
6     corinn NOT_WITHIN Clicked A    10   -10 -0.5882353
7     corinn NOT_WITHIN Clicked B     7    -7 -0.4117647
8     corinn     WITHIN Clicked A     9     9  0.7500000
9     corinn     WITHIN Clicked B     3     3  0.2500000
10  dpfeifer NOT_WITHIN Clicked A     7    -7 -1.0000000
..       ...        ...       ...   ...   ...        ...

接下来,您可以用以下代码创建图形:

  # Diverging stacked bars of interaction counts per user. The y value is n2,
  # so the two location subsets are drawn as two separate layers; the outline
  # colour marks the location and the fill marks the interaction type.
  ggplot() +
    geom_bar(
      data = new.df[new.df$location == "NOT_WITHIN", ],
      mapping = aes(x = UserEmail, y = n2, color = "darkgreen", fill = context),
      stat = "identity", width = 0.7, size = 1
    ) +
    geom_bar(
      data = new.df[new.df$location == "WITHIN", ],
      mapping = aes(x = UserEmail, y = n2, color = "darkred", fill = context),
      stat = "identity", width = 0.7, size = 1
    ) +
    # Label both sides of zero with absolute counts.
    scale_y_continuous(
      breaks = seq(-20, 20, 5),
      labels = abs(seq(-20, 20, 5))
    ) +
    scale_color_manual(
      name = "Location of interaction",
      values = c("darkgreen", "darkred"),
      labels = c("NOT_WITHIN", "WITHIN")
    ) +
    scale_fill_manual(
      name = "Type of interaction",
      values = c("lightyellow", "lightblue"),
      labels = c("Clicked A", "Clicked B")
    ) +
    # Legend keys: outline-only keys for the location legend (reversed order),
    # thin black outlines for the fill legend.
    guides(
      color = guide_legend(
        override.aes = list(
          color = c("darkred", "darkgreen"),
          fill = NA, size = 2
        ),
        reverse = TRUE
      ),
      fill = guide_legend(
        override.aes = list(
          fill = c("lightyellow", "lightblue"),
          color = "black", size = 0.5
        )
      )
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14),
      axis.title = element_blank(),
      legend.title = element_text(face = "italic", size = 14),
      legend.key.size = unit(1, "lines"),
      legend.text = element_text(size = 11)
    )

结果如下:





如果您想使用百分比值,您可以使用 p -column创建一个图:

  # Diverging stacked bars of within-location percentages per user, drawn
  # horizontally. The y value is p; the outline colour marks the location and
  # the fill marks the interaction type.
  ggplot() +
    geom_bar(
      data = new.df[new.df$location == "NOT_WITHIN", ],
      mapping = aes(x = UserEmail, y = p, color = "darkgreen", fill = context),
      stat = "identity", width = 0.7, size = 1
    ) +
    geom_bar(
      data = new.df[new.df$location == "WITHIN", ],
      mapping = aes(x = UserEmail, y = p, color = "darkred", fill = context),
      stat = "identity", width = 0.7, size = 1
    ) +
    # Label both sides of zero with absolute percentages.
    scale_y_continuous(
      breaks = seq(-1, 1, 0.25),
      labels = scales::percent(abs(seq(-1, 1, 0.25)))
    ) +
    scale_color_manual(
      name = "Location of interaction",
      values = c("darkgreen", "darkred"),
      labels = c("NOT_WITHIN", "WITHIN")
    ) +
    scale_fill_manual(
      name = "Type of interaction",
      values = c("lightyellow", "lightblue"),
      labels = c("Clicked A", "Clicked B")
    ) +
    coord_flip() +
    # Legend keys: outline-only keys for the location legend (reversed order),
    # thin black outlines for the fill legend.
    guides(
      color = guide_legend(
        override.aes = list(
          color = c("darkred", "darkgreen"),
          fill = NA, size = 2
        ),
        reverse = TRUE
      ),
      fill = guide_legend(
        override.aes = list(
          fill = c("lightyellow", "lightblue"),
          color = "black", size = 0.5
        )
      )
    ) +
    theme_minimal(base_size = 14) +
    theme(
      axis.title = element_blank(),
      legend.title = element_text(face = "italic", size = 14),
      legend.key.size = unit(1, "lines"),
      legend.text = element_text(size = 11)
    )

其结果如下:



p>




回应评论



如果您想将文本标签放置在小节中,您还必须计算一个位置变量:

  # Same summary as before plus a label position `pos`:
  #   n2  - the count, negated for "NOT_WITHIN" rows
  #   p   - n2 divided by the total count of its (UserEmail, location) group
  #   pos - p / 2 for "Clicked A" rows; for "Clicked B" rows, (1 - |p| / 2)
  #         negated for "NOT_WITHIN"; 0 for any other context value
  new.df <- RawDataSet %>%
    group_by(UserEmail, location, context) %>%
    tally() %>%
    mutate(n2 = n * ifelse(location == "NOT_WITHIN", -1, 1)) %>%
    group_by(UserEmail, location) %>%
    mutate(
      p = n2 / sum(n),
      pos = (context == "Clicked A") * p / 2 +
        (context == "Clicked B") *
          (ifelse(location == "NOT_WITHIN", -1, 1) * (1 - abs(p) / 2))
    )

然后在 geom_bar 的后面添加以下行到 ggplot 代码:

  # Text layer: writes the count n at the y position pos for every row.
  geom_text(
    data = new.df,
    mapping = aes(x = UserEmail, y = pos, label = n)
  )



其结果如下:





除了 label = n,您也可以使用 label = scales::percent(abs(p)) 来显示百分比。


I'm using ggplot2 to create histograms for two different parameters. My current approach is attached at the end of my question (including a dataset, which can be loaded and used right from pastebin.com), which creates

  1. a histogram visualizing the frequency for the spatial distribution of logged user data based on the "location"-attribute (either "WITHIN" or "NOT_WITHIN").
  2. a histogram visualizing the frequency for the distribution of logged user data based on the "context"-attribute (either "Clicked A" or "Clicked B").

This looks like the following:

# Load the example dataset from pastebin (semicolon-separated CSV).
RawDataSet <- read.csv("http://pastebin.com/raw/uKybDy03", sep = ";")

# Load packages. dplyr is attached after plyr, so names exported by both
# (e.g. summarize, mutate) resolve to the dplyr versions unless qualified.
library(plyr)
library(dplyr)
library(reshape2)
library(ggplot2)

###### Create Frequency Table for Location-Information
# One row per UserEmail with the total number of records and the number of
# records whose location is "WITHIN" / "NOT_WITHIN".
# plyr::summarize is named explicitly because the bare name is masked by dplyr.
LocationFrequency <- ddply(
  RawDataSet, .(UserEmail), plyr::summarize,
  All = length(UserEmail),
  Within_area = sum(location == "WITHIN"),
  Not_within_area = sum(location == "NOT_WITHIN")
)

# Add a numeric identifier (taken from the row names) as the first column.
# It has to be numeric because it is plotted on a continuous x axis below.
LocationFrequency <- cbind(
  id = as.numeric(rownames(LocationFrequency)),
  LocationFrequency
)

# Melt to long format: the two count columns become variable/value pairs.
LocationFrequency.m <- melt(
  LocationFrequency,
  id.vars = c("UserEmail", "All", "id")
)

# Stacked bars: one bar per user id, filled by location type.
p <- ggplot(LocationFrequency.m, aes(x = id, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  theme_grey(base_size = 16) +
  labs(title = "Histogram showing the distribution of all spatial information per user.") +
  labs(x = "User", y = "Number of notifications interaction within/not within the area") +
  # Use the ids instead of UserEmail: one tick per user, derived from the
  # data rather than from a hard-coded 1..30 list.
  scale_x_continuous(breaks = seq_len(nrow(LocationFrequency)))

# Change the legend title and draw the plot.
p + labs(fill = "Type of location")



##### Create Frequency Table for Interaction-Information
# One row per UserEmail with the total number of records and the number of
# records whose context is "Clicked A" / "Clicked B".
# plyr::summarize is named explicitly because dplyr (attached later) masks
# the bare name. The object name InterationFrequency is kept as-is (sic).
InterationFrequency <- ddply(
  RawDataSet, .(UserEmail), plyr::summarize,
  All = length(UserEmail),
  Clicked_A = sum(context == "Clicked A"),
  Clicked_B = sum(context == "Clicked B")
)

# Add a numeric identifier (taken from the row names) as the first column.
# It has to be numeric because it is plotted on a continuous x axis below.
InterationFrequency <- cbind(
  id = as.numeric(rownames(InterationFrequency)),
  InterationFrequency
)

# Melt to long format: the two count columns become variable/value pairs.
InterationFrequency.m <- melt(
  InterationFrequency,
  id.vars = c("UserEmail", "All", "id")
)

# Stacked bars: one bar per user id, filled by interaction type.
p <- ggplot(InterationFrequency.m, aes(x = id, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  theme_grey(base_size = 16) +
  labs(title = "Histogram showing the distribution of all interaction types per user.") +
  labs(x = "User", y = "Number of interaction") +
  # Use the ids instead of UserEmail: one tick per user, derived from the
  # data rather than from a hard-coded 1..30 list.
  scale_x_continuous(breaks = seq_len(nrow(InterationFrequency)))

# Change the legend title and draw the plot.
p + labs(fill = "Type of interaction")


But what I'm trying to achieve is this: how can I combine both histograms in a single plot? Would it somehow be possible to show the corresponding percentage for each part? Something like the following sketch, which represents the total number of observations per user (the complete height of the bar) and uses the different segments to visualize the corresponding data. Each bar would be divided into two parts (within and not_within), and each part would then be divided into two subparts showing the percentage of the interaction types ("Clicked A" or "Clicked B").

解决方案

With the updated description, I would make a combined barplot with two parts: a negative and a positive one. In order to achieve that, you have to get your data into the correct format:

# load needed libraries
library(dplyr)
library(tidyr)
library(ggplot2)

# Count interactions per user, location and context, then derive:
#   n2 - the count, negated for "NOT_WITHIN" rows
#   p  - n2 divided by the total count of its (UserEmail, location) group
new.df <- RawDataSet %>%
  group_by(UserEmail, location, context) %>%
  tally() %>%
  mutate(n2 = n * ifelse(location == "NOT_WITHIN", -1, 1)) %>%
  group_by(UserEmail, location) %>%
  mutate(p = n2 / sum(n))

The new.df dataframe looks like:

> new.df
Source: local data frame [90 x 6]
Groups: UserEmail, location [54]

   UserEmail   location   context     n    n2          p
      (fctr)     (fctr)    (fctr) (int) (dbl)      (dbl)
1      andre NOT_WITHIN Clicked A     3    -3 -1.0000000
2       bibi NOT_WITHIN Clicked A     4    -4 -0.5000000
3       bibi NOT_WITHIN Clicked B     4    -4 -0.5000000
4       bibi     WITHIN Clicked A     9     9  0.6000000
5       bibi     WITHIN Clicked B     6     6  0.4000000
6     corinn NOT_WITHIN Clicked A    10   -10 -0.5882353
7     corinn NOT_WITHIN Clicked B     7    -7 -0.4117647
8     corinn     WITHIN Clicked A     9     9  0.7500000
9     corinn     WITHIN Clicked B     3     3  0.2500000
10  dpfeifer NOT_WITHIN Clicked A     7    -7 -1.0000000
..       ...        ...       ...   ...   ...        ...

Next you can create a plot with:

# Diverging stacked bars of interaction counts per user. The y value is n2,
# so the two location subsets are drawn as two separate layers; the outline
# colour marks the location and the fill marks the interaction type.
ggplot() +
  geom_bar(
    data = new.df[new.df$location == "NOT_WITHIN", ],
    mapping = aes(x = UserEmail, y = n2, color = "darkgreen", fill = context),
    stat = "identity", width = 0.7, size = 1
  ) +
  geom_bar(
    data = new.df[new.df$location == "WITHIN", ],
    mapping = aes(x = UserEmail, y = n2, color = "darkred", fill = context),
    stat = "identity", width = 0.7, size = 1
  ) +
  # Label both sides of zero with absolute counts.
  scale_y_continuous(
    breaks = seq(-20, 20, 5),
    labels = abs(seq(-20, 20, 5))
  ) +
  scale_color_manual(
    name = "Location of interaction",
    values = c("darkgreen", "darkred"),
    labels = c("NOT_WITHIN", "WITHIN")
  ) +
  scale_fill_manual(
    name = "Type of interaction",
    values = c("lightyellow", "lightblue"),
    labels = c("Clicked A", "Clicked B")
  ) +
  # Legend keys: outline-only keys for the location legend (reversed order),
  # thin black outlines for the fill legend.
  guides(
    color = guide_legend(
      override.aes = list(
        color = c("darkred", "darkgreen"),
        fill = NA, size = 2
      ),
      reverse = TRUE
    ),
    fill = guide_legend(
      override.aes = list(
        fill = c("lightyellow", "lightblue"),
        color = "black", size = 0.5
      )
    )
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14),
    axis.title = element_blank(),
    legend.title = element_text(face = "italic", size = 14),
    legend.key.size = unit(1, "lines"),
    legend.text = element_text(size = 11)
  )

which results in:

If you want to use percentage values, you can use the p-column to make a plot:

# Diverging stacked bars of within-location percentages per user, drawn
# horizontally. The y value is p; the outline colour marks the location and
# the fill marks the interaction type.
ggplot() +
  geom_bar(
    data = new.df[new.df$location == "NOT_WITHIN", ],
    mapping = aes(x = UserEmail, y = p, color = "darkgreen", fill = context),
    stat = "identity", width = 0.7, size = 1
  ) +
  geom_bar(
    data = new.df[new.df$location == "WITHIN", ],
    mapping = aes(x = UserEmail, y = p, color = "darkred", fill = context),
    stat = "identity", width = 0.7, size = 1
  ) +
  # Label both sides of zero with absolute percentages.
  scale_y_continuous(
    breaks = seq(-1, 1, 0.25),
    labels = scales::percent(abs(seq(-1, 1, 0.25)))
  ) +
  scale_color_manual(
    name = "Location of interaction",
    values = c("darkgreen", "darkred"),
    labels = c("NOT_WITHIN", "WITHIN")
  ) +
  scale_fill_manual(
    name = "Type of interaction",
    values = c("lightyellow", "lightblue"),
    labels = c("Clicked A", "Clicked B")
  ) +
  coord_flip() +
  # Legend keys: outline-only keys for the location legend (reversed order),
  # thin black outlines for the fill legend.
  guides(
    color = guide_legend(
      override.aes = list(
        color = c("darkred", "darkgreen"),
        fill = NA, size = 2
      ),
      reverse = TRUE
    ),
    fill = guide_legend(
      override.aes = list(
        fill = c("lightyellow", "lightblue"),
        color = "black", size = 0.5
      )
    )
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.title = element_blank(),
    legend.title = element_text(face = "italic", size = 14),
    legend.key.size = unit(1, "lines"),
    legend.text = element_text(size = 11)
  )

which results in:


In response to the comment

If you want to place the text-labels inside the bars, you will have to calculate a position variable too:

# Same summary as before plus a label position `pos`:
#   n2  - the count, negated for "NOT_WITHIN" rows
#   p   - n2 divided by the total count of its (UserEmail, location) group
#   pos - p / 2 for "Clicked A" rows; for "Clicked B" rows, (1 - |p| / 2)
#         negated for "NOT_WITHIN"; 0 for any other context value
new.df <- RawDataSet %>%
  group_by(UserEmail, location, context) %>%
  tally() %>%
  mutate(n2 = n * ifelse(location == "NOT_WITHIN", -1, 1)) %>%
  group_by(UserEmail, location) %>%
  mutate(
    p = n2 / sum(n),
    pos = (context == "Clicked A") * p / 2 +
      (context == "Clicked B") *
        (ifelse(location == "NOT_WITHIN", -1, 1) * (1 - abs(p) / 2))
  )

Then add the following line to your ggplot code after the geom_bar's:

# Text layer: writes the count n at the y position pos for every row.
geom_text(
  data = new.df,
  mapping = aes(x = UserEmail, y = pos, label = n)
)

which results in:

Instead of label = n you can also use label = scales::percent(abs(p)) to display the percentages.

这篇关于组合频率直方图使用两个属性的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆