如何使用R提取位于另一个特定模式之前的两个特定模式? [英] How to extract two specific patterns before another specific pattern using R?

查看:20
本文介绍了如何使用R提取位于另一个特定模式之前的两个特定模式的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在尝试提取字母(应该是K或Y)以及该字母与模式 (XO44_TMT6) 之间的所有数字,并将提取的值分别放入两个单独的列(Mod.residue 和 Mod.position.in.pep)中,但无法得到我想要的结果。

下面是我的代码和数据框。谁能解释一下我的代码失败的原因以及如何修复它?

非常感谢!

我的数据框:

structure(list(Modifications = c("Y9(XO44_TMT6)", "Y9(XO44_TMT6)", 
"Y9(XO44_TMT6)", "Y9(XO44_TMT6)", "Y9(XO44_TMT6)", "Y9(XO44_TMT6)", 
"Y9(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", 
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", 
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", 
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", 
"Y8(XO44_TMT6)", "Y7(XO44_TMT6); M9(Oxidation)", "Y7(XO44_TMT6); M8(Oxidation)", 
"Y7(XO44_TMT6); M8(Oxidation)", "Y7(XO44_TMT6); C9(Carbamidomethyl); C18(Carbamidomethyl)", 
"Y7(XO44_TMT6); C15(Carbamidomethyl)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", 
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", 
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", 
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", 
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", 
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", 
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", 
"Y6(XO44_TMT6); C23(Carbamidomethyl)", "Y6(XO44_TMT6); C12(Carbamidomethyl)", 
"Y6(XO44_TMT6); C12(Carbamidomethyl)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", 
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", 
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", 
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", 
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", 
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", 
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", 
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y4(XO44_TMT6); C29(Carbamidomethyl)", 
"Y4(XO44_TMT6); C13(Carbamidomethyl)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", 
"Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", 
"Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", 
"Y4(XO44_TMT6)", "Y3(XO44_TMT6); M5(Oxidation)", "Y3(XO44_TMT6); C11(Carbamidomethyl)", 
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", 
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", 
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", 
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", 
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", 
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", 
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", 
"Y29(XO44_TMT6)", "Y25(XO44_TMT6)", "Y25(XO44_TMT6)", "Y25(XO44_TMT6)", 
"Y23(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)", 
"Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)", 
"Y21(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)", 
"Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)", 
"Y20(XO44_TMT6)", "Y2(XO44_TMT6); C8(Carbamidomethyl)", "Y2(XO44_TMT6); C19(Carbamidomethyl)", 
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6); C15(Carbamidomethyl)", 
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6); C15(Carbamidomethyl)", 
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", 
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", 
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", 
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", 
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y19(XO44_TMT6)", "Y19(XO44_TMT6)", 
"Y19(XO44_TMT6)", "Y19(XO44_TMT6)", "Y19(XO44_TMT6)", "Y18(XO44_TMT6)", 
"Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y18(XO44_TMT6)", 
"Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y17(XO44_TMT6)", "Y17(XO44_TMT6)", 
"Y17(XO44_TMT6)", "Y17(XO44_TMT6)", "Y17(XO44_TMT6)", "Y16(XO44_TMT6)", 
"Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)", 
"Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)", 
"Y16(XO44_TMT6)", "Y15(XO44_TMT6); C16(Carbamidomethyl)", "Y15(XO44_TMT6)", 
"Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)", 
"Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)", 
"Y14(XO44_TMT6); C15(Carbamidomethyl)", "Y14(XO44_TMT6); C15(Carbamidomethyl)", 
"Y14(XO44_TMT6)", "Y14(XO44_TMT6)", "Y14(XO44_TMT6)", "Y13(XO44_TMT6)", 
"Y13(XO44_TMT6)", "Y13(XO44_TMT6)", "Y12(XO44_TMT6); C14(Carbamidomethyl)", 
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", 
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", 
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", 
"Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)", 
"Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y10(XO44_TMT6)", 
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", 
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", 
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y1(XO44_TMT6); C9(Carbamidomethyl)", 
"Y1(XO44_TMT6); C15(Carbamidomethyl)", "Y1(XO44_TMT6); C15(Carbamidomethyl)", 
"Y1(XO44_TMT6); C15(Carbamidomethyl)", "Y1(XO44_TMT6); C15(Carbamidomethyl)", 
"Y1(XO44_TMT6); C11(Carbamidomethyl)", "Y1(XO44_TMT6); C11(Carbamidomethyl)", 
"Y1(XO44_TMT6); C11(Carbamidomethyl)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", 
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", 
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", 
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", 
"Y1(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)", 
"N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)", 
"N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y8(XO44_TMT6)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6)", 
"N-Term(Prot)(Met-loss+Acetyl); K4(XO44_TMT6)", "N-Term(Prot)(Met-loss); Y8(XO44_TMT6)", 
"N-Term(Prot)(Met-loss); Y8(XO44_TMT6)", "N-Term(Prot)(Met-loss); Y8(XO44_TMT6)", 
"N-Term(Prot)(Met-loss); Y8(XO44_TMT6)"), Mod.residue = c("9", 
"9", "9", "9", "9", "9", "9", "8", "8", "8", "8", "8", "8", "8", 
"8", "8", "8", "8", "8", "8", "8", "8", "8", "7", "7", "7", "7", 
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", 
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", 
"7", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", 
"6", "6", "6", "5", "5", "5", "5", "5", "5", "5", "5", "5", "5", 
"5", "5", "5", "5", "5", "5", "5", "4", "4", "4", "4", "4", "4", 
"4", "4", "4", "4", "4", "4", "4", "3", "3", "3", "3", "3", "3", 
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", 
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "9", "5", 
"5", "5", "3", "2", "2", "2", "2", "2", "2", "2", "1", "0", "0", 
"0", "0", "0", "0", "0", "0", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "9", "9", "9", "9", "9", "8", "8", "8", "8", "8", 
"8", "8", "7", "7", "7", "7", "7", "6", "6", "6", "6", "6", "6", 
"6", "6", "6", "6", "5", "5", "5", "5", "5", "5", "5", "5", "5", 
"5", "4", "4", "4", "4", "4", "3", "3", "3", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "1", "1", "1", "1", 
"1", "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", 
"0", "0", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", 
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "9", 
"9", "9", "9", "8", "7", "7", "7", "7", "7", "7", "7", "7", "7", 
"7", "7", "7", "7", "7", "7", "7", "7", "4", "8", "8", "8", "8"
), Mod.position.in.pep = c("", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "")), row.names = c(NA, -300L), class = "data.frame")

我的代码:

# Question code, kept as the asker wrote it except that the regex escapes are
# restored: an R string literal needs "\\" to hand a single backslash to the
# regex engine ("\w", "\d", "\(" are unrecognized escapes and do not parse).
#
# NOTE: this is still the *failing* attempt the question asks about. The
# leading ".*" is greedy and "\\w" also matches digits, so the first pattern
# captures the last digit of the position instead of the K/Y letter, and the
# "(\\d*)" group in the second pattern is left with an empty match. That is
# the result visible in the Mod.residue / Mod.position.in.pep columns of the
# data frame shown in the question.
df <- df.test %>% 
  mutate(Mod.residue = gsub(".*(\\w{1})\\d*\\(XO44_TMT6)\\;*\\s*.*", "\\1", Modifications),
         Mod.position.in.pep = gsub(".*\\w{1}(\\d*)\\(XO44_TMT6\\)\\;*\\s*.*", "\\1", Modifications)
           )

推荐答案

我认为您要找的是 tidyr::extract,它只需一次函数调用就能完全满足您的需求。

library(tidyr)

# Split each "<letter><digits>(XO44_TMT6)" entry into two columns in one call.
# Capture group 1 is the residue letter and group 2 the position. The
# "(XO44_TMT6)" tag is matched with a lookahead rather than a third capture
# group, so the number of groups equals the number of `into` columns.
# Backslashes are doubled because an R string literal needs "\\" to pass a
# single backslash to the regex engine.
output_extract <- df %>%
  extract(
    Modifications,
    into = c("Mod.residue", "Mod.position.in.pep"),
    regex = ".*([A-Z])(\\d+)(?=\\(XO44_TMT6\\)).*",
    remove = FALSE
  )

如果您想继续使用gsub,可以这样做(相同的模式,两个不同的替换:"\\1" 和 "\\2"):

# gsub() alternative: one pattern, two different backreferences.
# Group 1 is the residue letter, group 2 the position digits, group 3 the
# literal "(XO44_TMT6)" tag. Backslashes are doubled because an R string
# literal needs "\\" to pass a single backslash to the regex engine; "\\1" and
# "\\2" in the replacement refer to the first and second capture groups.
mod_pattern <- ".*([A-Z])(\\d+)(\\(XO44_TMT6\\)).*"

output_gsub <- df %>%
  mutate(
    Mod.residue = gsub(mod_pattern, "\\1", Modifications),
    Mod.position.in.pep = gsub(mod_pattern, "\\2", Modifications)
  )

这些方法产生相同的输出:

# Check that the extract() and gsub() results are exactly the same object.
identical(x = output_extract, y = output_gsub)

[1] TRUE

之后您可能需要使用 as.numeric 将 "Mod.position.in.pep" 变量转换为数值型。

输出列的唯一值:

$Mod.residue
[1] "Y" "K"

$Mod.position.in.pep
 [1] "9"  "8"  "7"  "6"  "5"  "4"  "3"  "29" "25" "23" "22" "21" "20" "2"  "19" "18" "17" "16" "15" "14" "13" "12" "11" "10" "1" 

编辑

上述方法仅当每个观测中 "Mod.residue" 或 "Mod.position.in.pep" 只有一个匹配时才有效。

如果每个观测中有多个 "[KY]digit(XO44_TMT6)" 匹配,则可能需要采用更复杂的方法:mutate %>% unnest_wider %>% unite。

# Load the packages before anything else: the example below calls tibble()
# and the pipe straight away, so the libraries must already be attached.
library(dplyr)
library(tidyr)
library(stringr)

# Example data: a single observation carrying two XO44_TMT6 modifications
df <- tibble(Modifications = "K4(XO44_TMT6); Y6(XO44_TMT6)")

# Solution
# 1. str_extract_all() returns a list column holding every match per row:
#    the letters followed by digits and "(XO44_TMT6)", and the digits
#    followed by "(XO44_TMT6)". The lookaheads keep the tag out of the match.
#    Backslashes are doubled because an R string literal needs "\\" to pass
#    a single backslash to the regex engine.
# 2. unnest_wider() spreads each list column into numbered columns
#    (Mod.residue_1, Mod.residue_2, ...).
# 3. unite() pastes the numbered columns back into one ";"-separated column,
#    dropping NA entries for rows with fewer matches.
df %>%
  mutate(
    Mod.residue = str_extract_all(Modifications, "[A-Z]+(?=\\d+\\(XO44_TMT6\\))"),
    Mod.position.in.pep = str_extract_all(Modifications, "\\d+(?=\\(XO44_TMT6\\))")
  ) %>%
  unnest_wider(col = "Mod.residue", names_sep = "_") %>%
  unnest_wider(col = "Mod.position.in.pep", names_sep = "_") %>%
  unite(starts_with("Mod.residue"), col = "Mod.residue", sep = ";", remove = TRUE, na.rm = TRUE) %>%
  unite(starts_with("Mod.position"), col = "Mod.position.in.pep", sep = ";", remove = TRUE, na.rm = TRUE)

输出

# A tibble: 1 x 3
  Modifications                Mod.residue Mod.position.in.pep
  <chr>                        <chr>       <chr>              
1 K4(XO44_TMT6); Y6(XO44_TMT6) K;Y         4;6 

这篇关于如何使用R提取位于另一个特定模式之前的两个特定模式的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆