R使用pivot_longer将名称值对从宽到长整形 [英] R reshape name value pairs from wide to long using pivot_longer
问题描述
我正在尝试弄清楚如何使用 dplyr
和 pivot_longer
将政党名称的数据集从宽变长.
对于每个 Party_ID
,都有许多附加的常量列("Party_Name_Short","Party_Name","Country","Party_in_orig_title")
,以及一些随时间变化的因素:选举,日期,重命名,原因,Party_Title,联盟,member_parties,拆分,parent_party,合并,child_party,继任者,前任
.缔约方中最多记录11次时变因子,这通过公司名称中的索引反映出来.
为了提供示例,我为每个参与方选择了前三个时变列和5个随机行的示例:
structure(list(Party_Name_Short = c("LZJ-PS","ZiZi","MNR","MDP","E200"),Party_Name = c("Lista Zorana Jankovica – Pozitivna Slovenija",Živizid",国家共和党运动",葡萄牙运动会""Erakond Eesti 200"),国家/地区= c("SVN","HRV","FRA","PRT","EST"),Party_ID = c(1987,2612,1263,1281,2720),Party_in_orig_title = c(0,0,0,0,0),Date1 = c(2011,NA,1999,1987,NA),Rename1 = c("Lista Zorana Jankovica – Pozitivna Slovenija",NA,国家运动","ID",NA),原因1 = c(基金会",NA,从FN拆分",拆分",NA),Party_Title1 = c(0,NA,0,0,NA),联盟1 = c(0,NA,0,0,NA),member_parties1 = c(NA_character_,NA_character_,NA_character_,NA_character_,NA_character_),split1 = c(0,NA,1,1,NA),parent_party1 = c(NA,NA,"FN","MDP",NA),合并1 = c(0,NA,0,0,NA),child_party1 = c(NA_character_,NA_character_,NA_character_,NA_character_,NA_character_),后继者1 = c(0,NA,0,0,NA),前任者1 = c(NA_character_,NA_character_,NA_character_,NA_character_,NA_character_),Date2 = c(2012,NA,NA,NA,NA),重命名2 = c("Pozitivna Slovenija",NA,NA,NA,NA),Reason2 = c(重命名",NA,NA,NA,NA),Party_Title2 = c(0,NA,NA,NA,NA),Alliance2 = c(0,NA,NA,NA,NA),member_parties2 = c(NA_character_,NA_character_,NA_character_,NA_character_,NA_character_),split2 = c(0,NA,NA,NA,NA),parent_party2 = c(NA_character_,NA_character_,NA_character_,NA_character_,NA_character_),merge2 = c(0,NA,NA,NA,NA),child_party2 = c(NA_character_,NA_character_,NA_character_,NA_character_,NA_character_),后继2 = c(0,NA,NA,NA,NA),前任2 = c(NA_character_,NA_character_,NA_character_,NA_character_,NA_character _),Date3 = c(2014,NA,NA,NA,NA),重命名3 = c("ZaAB",NA,NA,NA,NA),Reason3 = c("split",NA,NA,NA,NA),Party_Title3 = c(0,NA,NA,NA,NA),Alliance3 = c(0,NA,NA,NA,NA),member_parties3 = c(NA_character_,NA_character_,NA_character_,NA_character_,NA_character _),split3 = c(1,NA,NA,NA,NA),parent_party3 = c("LZJ-PS",NA,NA,NA,NA),merge3 = c(0,NA,NA,NA,NA),child_party3 = c(NA_character_,NA_character_,NA_character_,NA_character_,NA_character_),后继者3 = c(0,NA,NA,NA,NA),前任者3 = c(NA_character_,NA_character_,NA_character_,NA_character_,NA_character_),选举1 =结构(c(15309,16740,11839,6390,17956),class ="Date"),选举2 = structure(c(16252,NA,NA,NA,NA),类=日期"),选举3 =结构(c(16344,NA,NA,NA,NA),class ="Date")),row.names = c(NA,-5L),class = c("tbl_df","tbl","data.frame"))
我希望数据遵循长"结构,其中每个party_id和常数因子重复11次,并且随时间变化的因子只有一列.在此处制定的最受好评的答案之后,我尝试了以下命令的不同变体:
ivot_longer(cols = starts_with(c("election","Date","Rename","Reason","Party_Title",联盟",成员党",分裂",父母党",合并","child_party",继任者",前任")),names_to = c(.value","election","Date","Rename","Reason","Party_Title",联盟",成员党",分裂",父母党","merger","child_party","successor","predecessor"),names_sep ="_")%&%select(-matches("election [1-9]"),-matches("Date [1-9]"),-matches("Rename [1-9]"),-matches("Reason [1-9]"),-matches("alliance [1-9]"),-matches("member_parties [1-9]"),-matches("split [1-9]"),-matches("parent_party [1-9]"),-matches("merger [1-9]"),-matches("child_party [1-9]"),-matches("successor [1-9]"),-matches("predecessor [1-9]"),-matches("Party_Title [1-9]"),-matches("election1 [0-2]"),-matches("Date1 [0-2]"),-matches("Rename1 [0-2]")),-matches("Reason1 [0-2]"),-matches("alliance1 [0-2]"),-matches("member_parties1 [0-2]"),-matches("split1 [0-2]"),-matches("parent_party1 [0-2]"),-matches("merger1 [0-2]"),-matches("child_party1 [0-2]"),-matches("successor1 [0-2]"),-matches("predecessor1 [0-2]"),-matches("Party_Title1 [0-2]"))
但是,由于某些原因,我缺少了很多值,并且没有达到我想要的数据形状.如果您对如何执行此操作有任何想法,我将不胜感激.谢谢!
更新:
我希望最终输出看起来像这样:
structure(list(Party_Name_Short = c("LZJ-PS","ZiZi","MNR","MDP","E200","LZJ-PS","ZiZi","MNR","MDP","E200","LZJ-PS","ZiZi","MNR","MDP","E200"),Party_Name = c("Lista Zorana Jankovica – Pozitivna Slovenija",Živizid",国家共和党运动",葡萄牙运动会""Erakond Eesti 200","Lista Zorana Jankovica – Pozitivna Slovenija",Živizid",国家共和党运动",葡萄牙运动会""Erakond Eesti 200","Lista Zorana Jankovica – Pozitivna Slovenija",国家革命运动",国家共和党运动",葡萄牙民主运动""Erakond Eesti 200"),国家/地区= c("SVN","HRV","FRA","PRT","EST","SVN","HRV","FRA","PRT","EST","SVN","HRV","FRA","PRT","EST"),Party_ID = c(1987,2612,1263,1281,2720,1987,2612、1263、1281、2720、1987、2612、1263、1281、2720),Party_in_orig_title = c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),时间= c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3),Date = c(2011,NA,1999,1987,NA,2012,NA,NA,NA,NA,2014,NA,NA,NA,NA),重命名为c("Lista Zorana Jankovica – Pozitivna Slovenija",NA,国家共和党运动","ID",NA,"Pozitivna Slovenija",NA,NA,NA,NA,"ZaAB",NA,NA,NA,NA),原因= c(基金会",NA,从FN拆分",拆分",NA,重命名",NA,NA,NA,NA,"split",NA,NA,NA,NA),Party_Title = c(0,NA,0,0,NA,0,NA,NA,NA,NA,0,NA,NA,NA,NA),联盟= c(0,NA,0,0,NA,0,NA,NA,NA,NA,0,NA,NA,NA,NA),member_parties = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),split = c(0,NA,1,1,NA,0,NA,NA,NA,NA,1,NA,NA,NA,NA),parent_party = c(NA,NA,"FN","MDP",NA,NA,NA,NA,NA,NA,"LZJ-PS",NA,NA,NA,NA),合并= c(0,NA,0,0,NA,0,NA,NA,NA,NA,0,NA,NA,NA,NA),child_party = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),后继= c(0,NA,0,0,NA,0,NA,NA,NA,NA,0,NA,NA,NA,NA),前身= c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),选择=结构(c(1322697600,1446336000、1022889600、552096000、1551398400、1404172800,NA,NA,NA,NA,1412121600,NA,NA,NA,NA),类别= c("POSIXct","POSIXt"),tzone ="UTC")),row.names = c(NA,-15L),class = c("tbl_df","tbl","data.frame"))
注意:新添加的 time
列,请注意,这只是出于示例目的,具有三个时变因子,而实际上数据中有11个.
将 pivot_longer
与 names_sep
一起使用,可以在数字末尾和数字末尾之间进行拆分字符串
library(tidyr)图书馆(dplyr)df1%>%ivot_longer(cols = matchs('\\ d + $'),names_to = c(.value",'time'),names_sep =(?< = \\ D)(?= \\ d + $)")%>%安排(时间)#小标题:15 x 19#Party_Name_Short Party_Name国家Party_ID Party_in_orig_t…时间日期重命名原因Party_Title联盟member_parties拆分#< chr>< chr>< chr>< dbl>< dbl>< chr>< dbl>< chr>< chr>< dbl>< dbl>< chr>< dbl>#1 LZJ-PS Lista Zor…SVN 1987 0 1 2011 Lista…发现…0 0< NA>0#2 ZiZiŽivizid HRV 2612 0 1 NA< NA>< NA>NA NA< NA>不适用#3 MNR运动…FRA 1263 0 1 1999 Mouve…分流…0 0< NA>1个#4 MDP Movimento…PRT 1281 0 1 1987 ID split 0 0< NA>1个#5 E200 Erakond E…EST 2720 0 1 NA< NA>< NA>NA NA< NA>不适用#6 LZJ-PS Lista Zor…SVN 1987 0 2 2012 Pozit…renam…0 0< NA>0#7 ZiZiŽivizid HRV 2612 0 2 NA< NA>< NA>NA NA< NA>不适用#8 MNR运动…FRA 1263 0 2 NA< NA>< NA>NA NA< NA>不适用#9 MDP Movimento…PRT 1281 0 2 NA< NA>< NA>NA NA< NA>不适用#10 E200 Erakond E…EST 2720 0 2 NA< NA>< NA>NA NA< NA>不适用#11 LZJ-PS Lista Zor…SVN 1987 0 3 2014 ZaAB split 0 0< NA>1个#12 ZiZiŽivizid HRV 2612 0 3 NA< NA>< NA>NA NA< NA>不适用#13 MNR运动…FRA 1263 0 3 NA< NA>< NA>NA NA< NA>不适用#14 MDP Movimento…PRT 1281 0 3 NA< NA>< NA>NA NA< NA>不适用#15 E200 Erakond E…EST 2720 0 3 NA< NA>< NA>NA NA< NA>不适用#…还有6个变量:parent_party< chr> ;、合并< dbl>,child_party< chr> ;、后继者< dbl> ;、前任< chr> ;、选举< date>
I am trying to figure out how to reshape a dataset of the names of political parties from wide to long using dplyr
and pivot_longer
.
For each Party_ID
, there is a number of constant columns attached (Party_Name_Short, Party_Name, Country, Party_in_orig_title)
and a number of time changing factors as well: election, Date, Rename, Reason, Party_Title, alliance, member_parties, split, parent_party, merger, child_party, successor, predecessor
. The time changing factors were recorded up to 11 times for each party, as reflected by the index in the colname.
In order to provide a sample I selected the first three time changing columns for each party and a sample of 5 random rows:
structure(list(Party_Name_Short = c("LZJ-PS", "ZiZi", "MNR",
"MDP", "E200"), Party_Name = c("Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200"), Country = c("SVN", "HRV", "FRA", "PRT",
"EST"), Party_ID = c(1987, 2612, 1263, 1281, 2720), Party_in_orig_title = c(0,
0, 0, 0, 0), Date1 = c(2011, NA, 1999, 1987, NA), Rename1 = c("Lista Zorana Jankovica – Pozitivna Slovenija",
NA, "Mouvement national républicain", "ID", NA), Reason1 = c("foundation",
NA, "split from FN", "split", NA), Party_Title1 = c(0, NA, 0,
0, NA), alliance1 = c(0, NA, 0, 0, NA), member_parties1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_),
split1 = c(0, NA, 1, 1, NA), parent_party1 = c(NA, NA, "FN",
"MDP", NA), merger1 = c(0, NA, 0, 0, NA), child_party1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), successor1 = c(0, NA, 0, 0, NA), predecessor1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), Date2 = c(2012, NA, NA, NA, NA), Rename2 = c("Pozitivna Slovenija",
NA, NA, NA, NA), Reason2 = c("renamed", NA, NA, NA, NA),
Party_Title2 = c(0, NA, NA, NA, NA), alliance2 = c(0, NA,
NA, NA, NA), member_parties2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), split2 = c(0,
NA, NA, NA, NA), parent_party2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), merger2 = c(0,
NA, NA, NA, NA), child_party2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), successor2 = c(0,
NA, NA, NA, NA), predecessor2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), Date3 = c(2014,
NA, NA, NA, NA), Rename3 = c("ZaAB", NA, NA, NA, NA), Reason3 = c("split",
NA, NA, NA, NA), Party_Title3 = c(0, NA, NA, NA, NA), alliance3 = c(0,
NA, NA, NA, NA), member_parties3 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), split3 = c(1,
NA, NA, NA, NA), parent_party3 = c("LZJ-PS", NA, NA, NA,
NA), merger3 = c(0, NA, NA, NA, NA), child_party3 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), successor3 = c(0, NA, NA, NA, NA), predecessor3 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), election1 = structure(c(15309, 16740, 11839, 6390, 17956
), class = "Date"), election2 = structure(c(16252, NA, NA,
NA, NA), class = "Date"), election3 = structure(c(16344,
NA, NA, NA, NA), class = "Date")), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
I would like the data to follow a "long" structure where each party_id and the constant factors are repeated 11 times and there are single columns for the time changing factors. Following the top-rated answer formulated here I tried different variations of the following command:
pivot_longer(cols = starts_with(c("election", "Date", "Rename", "Reason", "Party_Title",
"alliance", "member_parties", "split", "parent_party",
"merger", "child_party", "successor", "predecessor")),
names_to = c(".value", "election", "Date", "Rename", "Reason", "Party_Title",
"alliance", "member_parties", "split", "parent_party",
"merger", "child_party", "successor", "predecessor"), names_sep = "_") %>%
select(-matches("election[1-9]"), -matches("Date[1-9]"), -matches("Rename[1-9]"),
-matches("Reason[1-9]"), -matches("alliance[1-9]"), -matches("member_parties[1-9]"),
-matches("split[1-9]"), -matches("parent_party[1-9]"), -matches("merger[1-9]"),
-matches("child_party[1-9]"), -matches("successor[1-9]"), -matches("predecessor[1-9]"),
-matches("Party_Title[1-9]"), -matches("election1[0-2]"), -matches("Date1[0-2]"), -matches("Rename1[0-2]"),
-matches("Reason1[0-2]"), -matches("alliance1[0-2]"), -matches("member_parties1[0-2]"),
-matches("split1[0-2]"), -matches("parent_party1[0-2]"), -matches("merger1[0-2]"),
-matches("child_party1[0-2]"), -matches("successor1[0-2]"), -matches("predecessor1[0-2]"),
-matches("Party_Title1[0-2]"))
However, for some reason, I get a lot of missing values and do not achieve the shape of the data I would like to have. I'd appreciate any hint if you have an idea of how to do this. Thanks!
Update:
I would like the final output to look something like:
structure(list(Party_Name_Short = c("LZJ-PS", "ZiZi", "MNR",
"MDP", "E200", "LZJ-PS", "ZiZi", "MNR", "MDP", "E200", "LZJ-PS",
"ZiZi", "MNR", "MDP", "E200"), Party_Name = c("Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200", "Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200", "Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200"), Country = c("SVN", "HRV", "FRA", "PRT",
"EST", "SVN", "HRV", "FRA", "PRT", "EST", "SVN", "HRV", "FRA",
"PRT", "EST"), Party_ID = c(1987, 2612, 1263, 1281, 2720, 1987,
2612, 1263, 1281, 2720, 1987, 2612, 1263, 1281, 2720), Party_in_orig_title = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), time = c(1, 1, 1,
1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3), Date = c(2011, NA, 1999,
1987, NA, 2012, NA, NA, NA, NA, 2014, NA, NA, NA, NA), Rename = c("Lista Zorana Jankovica – Pozitivna Slovenija",
NA, "Mouvement national républicain", "ID", NA, "Pozitivna Slovenija",
NA, NA, NA, NA, "ZaAB", NA, NA, NA, NA), Reason = c("foundation",
NA, "split from FN", "split", NA, "renamed", NA, NA, NA, NA,
"split", NA, NA, NA, NA), Party_Title = c(0, NA, 0, 0, NA, 0,
NA, NA, NA, NA, 0, NA, NA, NA, NA), alliance = c(0, NA, 0, 0,
NA, 0, NA, NA, NA, NA, 0, NA, NA, NA, NA), member_parties = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), split = c(0,
NA, 1, 1, NA, 0, NA, NA, NA, NA, 1, NA, NA, NA, NA), parent_party = c(NA,
NA, "FN", "MDP", NA, NA, NA, NA, NA, NA, "LZJ-PS", NA, NA, NA,
NA), merger = c(0, NA, 0, 0, NA, 0, NA, NA, NA, NA, 0, NA, NA,
NA, NA), child_party = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), successor = c(0, NA, 0, 0, NA, 0, NA,
NA, NA, NA, 0, NA, NA, NA, NA), predecessor = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), election = structure(c(1322697600,
1446336000, 1022889600, 552096000, 1551398400, 1404172800, NA,
NA, NA, NA, 1412121600, NA, NA, NA, NA), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -15L), class = c("tbl_df",
"tbl", "data.frame"))
Notice: the newly added time
column and notice that this is only for example purposes, with three time changing factors, whereas in fact there are 11 in the data.
Using pivot_longer
with names_sep
to split between a non-digit and a digit at the end of the string
library(tidyr)
library(dplyr)
df1 %>%
pivot_longer(cols = matches('\\d+$'), names_to = c(".value", 'time'),
names_sep="(?<=\\D)(?=\\d+$)") %>%
arrange(time)
# A tibble: 15 x 19
# Party_Name_Short Party_Name Country Party_ID Party_in_orig_t… time Date Rename Reason Party_Title alliance member_parties split
# <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
# 1 LZJ-PS Lista Zor… SVN 1987 0 1 2011 Lista… found… 0 0 <NA> 0
# 2 ZiZi Živi zid HRV 2612 0 1 NA <NA> <NA> NA NA <NA> NA
# 3 MNR Mouvement… FRA 1263 0 1 1999 Mouve… split… 0 0 <NA> 1
# 4 MDP Movimento… PRT 1281 0 1 1987 ID split 0 0 <NA> 1
# 5 E200 Erakond E… EST 2720 0 1 NA <NA> <NA> NA NA <NA> NA
# 6 LZJ-PS Lista Zor… SVN 1987 0 2 2012 Pozit… renam… 0 0 <NA> 0
# 7 ZiZi Živi zid HRV 2612 0 2 NA <NA> <NA> NA NA <NA> NA
# 8 MNR Mouvement… FRA 1263 0 2 NA <NA> <NA> NA NA <NA> NA
# 9 MDP Movimento… PRT 1281 0 2 NA <NA> <NA> NA NA <NA> NA
#10 E200 Erakond E… EST 2720 0 2 NA <NA> <NA> NA NA <NA> NA
#11 LZJ-PS Lista Zor… SVN 1987 0 3 2014 ZaAB split 0 0 <NA> 1
#12 ZiZi Živi zid HRV 2612 0 3 NA <NA> <NA> NA NA <NA> NA
#13 MNR Mouvement… FRA 1263 0 3 NA <NA> <NA> NA NA <NA> NA
#14 MDP Movimento… PRT 1281 0 3 NA <NA> <NA> NA NA <NA> NA
#15 E200 Erakond E… EST 2720 0 3 NA <NA> <NA> NA NA <NA> NA
# … with 6 more variables: parent_party <chr>, merger <dbl>, child_party <chr>, successor <dbl>, predecessor <chr>, election <date>
这篇关于R使用pivot_longer将名称值对从宽到长整形的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!