如何使用R提取位于另一个特定模式之前的两个特定模式? [英] How to extract two specific patterns before another specific pattern using R?
本文介绍了如何使用R提取位于另一个特定模式之前的两个特定模式的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!
问题描述
我正在尝试提取字母(应该是K或Y)以及该字母与模式(XO44_TMT6)之间的所有数字,并将提取的值分别放在两个单独的列(Mod.residue 和 Mod.position.in.pep)中,但无法得到我想要的结果。
下面是我的代码和数据框。谁能解释一下我的代码失败的原因以及如何修复它?
非常感谢!
我的数据框:
structure(list(Modifications = c("Y9(XO44_TMT6)", "Y9(XO44_TMT6)",
"Y9(XO44_TMT6)", "Y9(XO44_TMT6)", "Y9(XO44_TMT6)", "Y9(XO44_TMT6)",
"Y9(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)",
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)",
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)",
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)",
"Y8(XO44_TMT6)", "Y7(XO44_TMT6); M9(Oxidation)", "Y7(XO44_TMT6); M8(Oxidation)",
"Y7(XO44_TMT6); M8(Oxidation)", "Y7(XO44_TMT6); C9(Carbamidomethyl); C18(Carbamidomethyl)",
"Y7(XO44_TMT6); C15(Carbamidomethyl)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)",
"Y6(XO44_TMT6); C23(Carbamidomethyl)", "Y6(XO44_TMT6); C12(Carbamidomethyl)",
"Y6(XO44_TMT6); C12(Carbamidomethyl)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)",
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)",
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)",
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)",
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)",
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)",
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)",
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y4(XO44_TMT6); C29(Carbamidomethyl)",
"Y4(XO44_TMT6); C13(Carbamidomethyl)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)",
"Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)",
"Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)",
"Y4(XO44_TMT6)", "Y3(XO44_TMT6); M5(Oxidation)", "Y3(XO44_TMT6); C11(Carbamidomethyl)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)",
"Y29(XO44_TMT6)", "Y25(XO44_TMT6)", "Y25(XO44_TMT6)", "Y25(XO44_TMT6)",
"Y23(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)",
"Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)",
"Y21(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)",
"Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)",
"Y20(XO44_TMT6)", "Y2(XO44_TMT6); C8(Carbamidomethyl)", "Y2(XO44_TMT6); C19(Carbamidomethyl)",
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6); C15(Carbamidomethyl)",
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6); C15(Carbamidomethyl)",
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)",
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)",
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)",
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)",
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y19(XO44_TMT6)", "Y19(XO44_TMT6)",
"Y19(XO44_TMT6)", "Y19(XO44_TMT6)", "Y19(XO44_TMT6)", "Y18(XO44_TMT6)",
"Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y18(XO44_TMT6)",
"Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y17(XO44_TMT6)", "Y17(XO44_TMT6)",
"Y17(XO44_TMT6)", "Y17(XO44_TMT6)", "Y17(XO44_TMT6)", "Y16(XO44_TMT6)",
"Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)",
"Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)",
"Y16(XO44_TMT6)", "Y15(XO44_TMT6); C16(Carbamidomethyl)", "Y15(XO44_TMT6)",
"Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)",
"Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)",
"Y14(XO44_TMT6); C15(Carbamidomethyl)", "Y14(XO44_TMT6); C15(Carbamidomethyl)",
"Y14(XO44_TMT6)", "Y14(XO44_TMT6)", "Y14(XO44_TMT6)", "Y13(XO44_TMT6)",
"Y13(XO44_TMT6)", "Y13(XO44_TMT6)", "Y12(XO44_TMT6); C14(Carbamidomethyl)",
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)",
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)",
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)",
"Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)",
"Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y10(XO44_TMT6)",
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)",
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)",
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y1(XO44_TMT6); C9(Carbamidomethyl)",
"Y1(XO44_TMT6); C15(Carbamidomethyl)", "Y1(XO44_TMT6); C15(Carbamidomethyl)",
"Y1(XO44_TMT6); C15(Carbamidomethyl)", "Y1(XO44_TMT6); C15(Carbamidomethyl)",
"Y1(XO44_TMT6); C11(Carbamidomethyl)", "Y1(XO44_TMT6); C11(Carbamidomethyl)",
"Y1(XO44_TMT6); C11(Carbamidomethyl)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)",
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)",
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)",
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)",
"Y1(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)",
"N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)",
"N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y8(XO44_TMT6)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)",
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6)",
"N-Term(Prot)(Met-loss+Acetyl); K4(XO44_TMT6)", "N-Term(Prot)(Met-loss); Y8(XO44_TMT6)",
"N-Term(Prot)(Met-loss); Y8(XO44_TMT6)", "N-Term(Prot)(Met-loss); Y8(XO44_TMT6)",
"N-Term(Prot)(Met-loss); Y8(XO44_TMT6)"), Mod.residue = c("9",
"9", "9", "9", "9", "9", "9", "8", "8", "8", "8", "8", "8", "8",
"8", "8", "8", "8", "8", "8", "8", "8", "8", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6",
"6", "6", "6", "5", "5", "5", "5", "5", "5", "5", "5", "5", "5",
"5", "5", "5", "5", "5", "5", "5", "4", "4", "4", "4", "4", "4",
"4", "4", "4", "4", "4", "4", "4", "3", "3", "3", "3", "3", "3",
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3",
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "9", "5",
"5", "5", "3", "2", "2", "2", "2", "2", "2", "2", "1", "0", "0",
"0", "0", "0", "0", "0", "0", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "9", "9", "9", "9", "9", "8", "8", "8", "8", "8",
"8", "8", "7", "7", "7", "7", "7", "6", "6", "6", "6", "6", "6",
"6", "6", "6", "6", "5", "5", "5", "5", "5", "5", "5", "5", "5",
"5", "4", "4", "4", "4", "4", "3", "3", "3", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "1", "1", "1", "1",
"1", "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "9",
"9", "9", "9", "8", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "7", "7", "4", "8", "8", "8", "8"
), Mod.position.in.pep = c("", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "")), row.names = c(NA, -300L), class = "data.frame")
我的代码:
# Asker's original attempt, kept as posted because it illustrates the problem.
# Why it does not give the wanted result: the leading `.*` is greedy and `\\w`
# also matches digits, so `(\\w{1})` captures the last digit in front of
# "(XO44_TMT6)" instead of the letter, and `(\\d*)` is left matching the
# empty string.
# Backslashes are doubled because "\w", "\d", "\(" etc. are not valid escape
# sequences inside an R string literal.
df <- df.test %>%
  mutate(
    Mod.residue = gsub(
      ".*(\\w{1})\\d*\\(XO44_TMT6\\)\\;*\\s*.*", "\\1", Modifications
    ),
    Mod.position.in.pep = gsub(
      ".*\\w{1}(\\d*)\\(XO44_TMT6\\)\\;*\\s*.*", "\\1", Modifications
    )
  )
推荐答案
我认为您要找的是tidyr::extract,它只需一次函数调用就能完全满足您的需求。
library(tidyr)

# Split `Modifications` into the residue letter and its position in a single
# tidyr::extract() call; each capture group fills one name in `into`.
#   ([A-Z])  -> Mod.residue          (the letter right before the digits)
#   (\\d+)   -> Mod.position.in.pep  (all digits up to the tag)
# The lookahead `(?=\\(XO44_TMT6\\))` requires the tag to follow the digits
# without adding a third capture group.
# Backslashes are doubled because R string literals need `\\` to pass a single
# backslash through to the regex engine.
# `remove = FALSE` keeps the original `Modifications` column.
output_extract <- df %>%
  extract(
    Modifications,
    into = c("Mod.residue", "Mod.position.in.pep"),
    regex = ".*([A-Z])(\\d+)(?=\\(XO44_TMT6\\)).*",
    remove = FALSE
  )
如果您想继续使用gsub,可以这样做(相同的模式,两个不同的替换(\1 和 \2)):
# Same extraction with base gsub(): one shared pattern, two replacements.
#   group 1 = residue letter, group 2 = position digits, group 3 = the tag.
# The whole string is matched (`.*` on both sides), so the replacement leaves
# only the chosen group behind.
# Backslashes are doubled because R string literals need `\\` to pass a single
# backslash through to the regex engine; the same applies to "\\1" and "\\2".
# Note: gsub() returns the input string unchanged when the pattern does not
# match.
mod_pattern <- ".*([A-Z])(\\d+)(\\(XO44_TMT6\\)).*"
output_gsub <- df %>%
  mutate(
    Mod.residue = gsub(mod_pattern, "\\1", Modifications),
    Mod.position.in.pep = gsub(mod_pattern, "\\2", Modifications)
  )
这些方法产生相同的输出:
# Check that the extract() and gsub() results are exactly the same object.
identical(output_extract, output_gsub)
[1] TRUE
您可能希望之后使用as.numeric将"Mod.position.in.pep"变量转换为数字。
输出列的唯一值:
$Mod.residue
[1] "Y" "K"
$Mod.position.in.pep
[1] "9" "8" "7" "6" "5" "4" "3" "29" "25" "23" "22" "21" "20" "2" "19" "18" "17" "16" "15" "14" "13" "12" "11" "10" "1"
编辑
以上方法仅当每行中"Mod.residue"和"Mod.position.in.pep"各只有一个匹配时才有效。
如果每条观测中有多个"[KY]数字(XO44_TMT6)"形式的匹配,则可能必须采用更复杂的方法:mutate %>% unnest_wider %>% unite。
# Packages are loaded first so that tibble(), mutate(), unnest_wider(),
# unite() and str_extract_all() are available before they are used.
library(dplyr)
library(tidyr)
library(stringr)

# Example data: one row carrying two "(XO44_TMT6)" modifications.
df <- tibble(Modifications = "K4(XO44_TMT6); Y6(XO44_TMT6)")

# Solution:
#   1. str_extract_all() returns, per row, every residue letter / every
#      position that is followed by "(XO44_TMT6)" (list-columns).
#   2. unnest_wider() spreads each list-column into numbered columns
#      (Mod.residue_1, Mod.residue_2, ...).
#   3. unite() pastes those numbered columns back into one ";"-separated
#      column, skipping NAs.
# Backslashes are doubled because R string literals need `\\` to pass a single
# backslash through to the regex engine.
df %>%
  mutate(
    Mod.residue = str_extract_all(
      Modifications, "[A-Z]+(?=\\d+\\(XO44_TMT6\\))"
    ),
    Mod.position.in.pep = str_extract_all(
      Modifications, "\\d+(?=\\(XO44_TMT6\\))"
    )
  ) %>%
  unnest_wider(col = "Mod.residue", names_sep = "_") %>%
  unnest_wider(col = "Mod.position.in.pep", names_sep = "_") %>%
  unite(
    col = "Mod.residue", starts_with("Mod.residue"),
    sep = ";", remove = TRUE, na.rm = TRUE
  ) %>%
  unite(
    col = "Mod.position.in.pep", starts_with("Mod.position"),
    sep = ";", remove = TRUE, na.rm = TRUE
  )
输出
# A tibble: 1 x 3
Modifications Mod.residue Mod.position.in.pep
<chr> <chr> <chr>
1 K4(XO44_TMT6); Y6(XO44_TMT6) K;Y 4;6
这篇关于如何使用R在提取两个特定模式之前提取另一个特定模式?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!
查看全文