如何通过迭代方法或管道运算符从嵌套数据集中删除异常值行 [英] How to delete rows of outliers from a nested dataset via an iterative method or pipe operator

查看：18 发布时间：2022/1/24 11:55:30 r loops if-statement iteration outliers

本文介绍了如何通过迭代方法或管道运算符从嵌套数据集中删除异常值行的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我正在尝试从这个嵌套数据集中删除异常值

df_join
# A tibble: 12 x 2
# Groups:   signals [12]
   signals     data
   <chr>       <list>
 1 P3FCz       <tibble [75 x 5]>
 2 P3Cz        <tibble [75 x 5]>
 3 P3Pz        <tibble [75 x 5]>
 4 LPPearlyFCz <tibble [75 x 5]>
 5 LPPearlyCz  <tibble [75 x 5]>
 6 LPPearlyPz  <tibble [75 x 5]>
 7 LPP1FCz     <tibble [75 x 5]>
 8 LPP1Cz      <tibble [75 x 5]>
 9 LPP1Pz      <tibble [75 x 5]>
10 LPP2FCz     <tibble [75 x 5]>
11 LPP2Cz      <tibble [75 x 5]>
12 LPP2Pz      <tibble [75 x 5]>

例如，它的第一个元素包含这一系列变量:

df_join[[2]][[1]]
# A tibble: 75 x 5
   ID    GR    SES   COND      value
   <fct> <fct> <fct> <fct>     <dbl>
 1 01    RP    V     NEG-CTR -11.6
 2 01    RP    V     NEG-NOC -11.1
 3 01    RP    V     NEU-NOC  -4.00
 4 04    RP    V     NEG-CTR  -0.314
 5 04    RP    V     NEG-NOC   0.239
 6 04    RP    V     NEU-NOC   5.04
 7 06    RP    V     NEG-CTR  -0.214
 8 06    RP    V     NEG-NOC  -2.96
 9 06    RP    V     NEU-NOC  -1.97
10 07    RP    V     NEG-CTR  -2.83

全部内容如下:

<代码>>输入(头(df_join))结构(列表(信号 = c(P3FCz"，P3Cz"，P3Pz"，LPPearlyFCz"，LPPearlyCz"，LPPearlyPz")，数据=列表(结构(列表(ID =结构(c(1L，1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L,6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L,11L, 12L, 12L, 12L, 13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L,16L, 16L, 16L, 17L, 17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L,20L, 20L, 21L, 21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L,24L, 25L, 25L, 25L), .Label = c(01", 04", 06", 07", 08",09"、10"、11"、12"、13"、15"、16"、17"、18"、19"、21"，22"、23"、25"、27"、28"、30"、44"、46"、49")，类别 = 因素")，GR =结构(c(1L，1L，1L，1L，1L，1L，1L，1L，1L，1L，1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), SES = structure(c(1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"),条件 = 结构(c(1L，2L，3L，1L，2L，3L，1L，2L，3L，1L，2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L), .Label = c(NEG-CTR", NEG-NOC", NEU-NOC"), 类 = 因子"), 值 = c(-11.6312151716924, -11.1438413285935,-3.99591470944713，-0.314155675382471，0.238885648959708，5.03749946898385，-0.213621915029167，-2.96032491743069，-1.97168681693488, -2.83109425298642, 1.09291198163802, 
-6.692991645215,4.23849942428043、2.9898889629932、3.5510699900835、9.57481668808606、5.4167795618285、1.7067607715475、-6.13036076093477、-2.82955734597919、-2.50672211111696、0.528517585832501、8.16418133488309、1.88777321897925、-7.73588468896919，-9.83058052401056，-6.97442700196932，1.27327945355082、2.11962397764132、0.524299677616254、-1.83310726842883、0.658810483381172, -0.261373488428192, 4.37524298634374,0.625555654900511, 3.19617639836154, 0.0405517582137798,-3.29357103412113，-0.381435057304614，-5.73445509910268，-6.1129152355645, -2.45744234877604, 2.95352732001065, 0.527721249096473,1.91803490989119，-3.46703346467546，-2.40438419043702，-5.35374408162217，-7.27028665849262，-7.1532211375959，-5.39955520296854，2.65765002364624，0.372495441513391、6.24433066412776、1.85698518142405、-0.564454675803529、-0.068523080368053，-7.04782633579147，-4.52263283590558，-6.62134671432544、4.56661945182626、3.05859761335498、2.02997952225347、-6.10523962206958，-0.521871236969702，-3.97851995684846，-2.61258020387919，-4.13974828699279，-3.9210032516844，-4.63162466544638，-4.36762718685405，-6.71005969834916，-4.22719611676328，-0.229916506217565, -5.69725200870146)), 类 = c(tbl_df",tbl"，data.frame")，row.names = c(NA，-75L))，结构(列表(ID =结构(c(1L，1L，1L，2L，2L，2L，3L，3L，3L，4L，4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L,9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 13L,13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L,17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L,21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L,25L, 25L), .Label = c(01", 04", 06", 07", 08", 09",10"、11"、12"、13"、15"、16"、17"、18"、19"、21"、22"、23"、25"、27"、28"、30"、44"、46"、49")，类别 = 因素")，GR =结构(c(1L，1L，1L，1L，1L，1L，1L，1L，1L，1L，1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L), .Label = "RP", class = 
"factor"), SES = structure(c(1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"),条件 = 结构(c(1L，2L，3L，1L，2L，3L，1L，2L，3L，1L，2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L), .Label = c(NEG-CTR", NEG-NOC", NEU-NOC"), 类 = 因子"), 值 = c(-5.16524399006139, -5.53112490175437,0.621502123415388, 2.23100741241039, 3.96990710862955, 7.75899775608441,-1.30019374375434，-3.59899040898949，-1.92340529575071，2.19344184533265, 5.87900720863083, -5.92378937757888, 2.44958531767688,3.10043497883256、1.65779442628225、13.7118233181713、6.86178446511352、5.31481098188172, -4.13240668697805, 0.162182285588285, 0.142083484505352,5.42592103255673、14.5496375672716、4.52018125654081、-2.40677805475299、-5.3832670295207，-1.55736964635117，3.48359241788107，4.23167123533126，2.00051785325202、1.48755216347718、2.37269462739372、1.30346907198835、3.89476490634811, 1.87516303240986, 4.36353100770575, 
1.9413417416824,-2.22114447555529，-0.015852062711641，-2.76146409940467，-3.51627712447581、1.01799377568815、1.74783962328435、1.1303870721987、2.16398550183836，-3.31557794753334，-1.83920975041768，-6.06703163736936，-8.1566939611461，-9.23030396302541，-4.35545141573936，0.906302081219897，0.45401759063429、3.80236232314171、4.0336657306528、2.0185967445137、0.835589319243251，-4.6805488231028，-1.20746167339041，-5.50475999427345，4.96594373869991、4.1349308440931、3.00187233307059、-5.61465293602653、0.544596077279702，-5.20450410570445，-0.0325220589039272，-2.28038421035601，-2.01375702882255，-1.6547144697087，-0.619979893871085，-4.48258340054462，-1.42281778522059，2.62315679073783，-4.13736508533355)), class = c(tbl_df", tbl", data.frame"), row.names = c(NA,-75L))，结构(列表(ID = 结构(c(1L，1L，1L，2L，2L，2L，3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L,8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L,13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L,17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 21L,21L、22L、22L、22L、23L、23L、23L、24L、24L、24L、25L、25L、25L)、.Label = c(01"、04"、06"、07"、08"、09"、10"、11"、12"、13"、15"、16"、17"、18"、19"、21"、22"、23"、25"，27"，28"，30"，44"，46"，49")，类 =因子")，GR =结构(c(1L，1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"),SES = 结构(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"), COND = structure(c(1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c(NEG-CTR",NEG-NOC"，NEU-NOC")，类 = 因子")，值 = c(11.8802266972569，12.1053426662461, 12.955441582096, 15.0981004360619, 15.4046229884164,16.671036999147, 3.13771453335467, -0.0892565159000666, 2.15365554736525,13.6778924406572, 14.3862738306396, 6.86762877785576, 7.47946451329025,8.93405130318593, 8.45962311067909, 23.4166601996042, 15.1868092142896,9.97183712753913, 6.267521071803, 10.142198458411, 10.6320358418368,12.9998037913548, 20.7052065690674, 11.8852179570666, 15.7899796085713,7.50729833890206, 14.3076172484818, 9.93797956768228, 10.7693238464384,5.04681800218272, 5.16656503460515, 7.87875085817396, 2.29899409536951,10.0135486953849, 5.48278706243332, 7.81908431468528, 8.64382513728869,3.35777109534179, 3.47474629234488, 4.35678644331281, 3.47085321062162,6.56231512354717, 4.93825547529124, 7.33985613752315, 6.81966900599588,6.54487921689425、7.25872117706077、1.10301223694429、-0.856423579793706、-0.887835692028378，-0.931653372049331，5.6617683754256，2.29939831067085、5.1554825066748、6.59026080217083、3.0741733363644、1.80359068950898, 1.63892755704177, 3.857933716935, 0.769316188513939,10.7031907391191, 9.53278894637555, 8.01071628743378, 6.04891324234645,11.1964453850602, 3.46633322373091, 14.4393884282958, 11.2339563353478,7.74933708914689, 7.1182095475238, 7.39260082121406, 0.627435381320771,9.15473202689768, 13.6559037433263, 7.14786907480758)), class = c("tbl_df",tbl"，data.frame")，row.names = c(NA，-75L))，结构(列表(ID =结构(c(1L，1L，1L，2L，2L，2L，3L，3L，3L，4L，4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L,9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 13L,13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L,17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L,21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L,25L, 25L), 
.Label = c(01", 04", 06", 07", 08", 09",10"、11"、12"、13"、15"、16"、17"、18"、19"、21"、22"、23"、25"、27"、28"、30"、44"、46"、49")，类别 = 因素")，GR =结构(c(1L，1L，1L，1L，1L，1L，1L，1L，1L，1L，1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), SES = structure(c(1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"),条件 = 结构(c(1L，2L，3L，1L，2L，3L，1L，2L，3L，1L，2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L), .Label = c(NEG-CTR", NEG-NOC", NEU-NOC"), 类 = 因子"), 值 = c(-11.7785042972793, -9.14927207125904,-7.58190508537766，-4.01515836011381，-6.60165385653499，-2.02861964460179、4.46729570509601、2.54036572774646、2.22923889930115、-0.883620011106743，-2.63569087592267，-2.0629672230873，1.14544537612393、2.08056674659401、0.0422658298956365、13.2986259796748、5.06669915366333、3.93467692474742、0.0229069420708053、4.31923128857779、0.237726051904304、1.89972383690448、3.2371880079134、0.318100791495115、-8.08292381883298，-5.73174008540523，-15.7998485301436，1.75469999857951, 0.677370118816266, -1.8397955509895, 2.55445787016256,-0.380810453692585, 0.62462329496673, 2.61316333850434, 2.68202480583985,1.76690658846479，0.148635887703097，-0.958853757041888，-3.17305964093897，-7.82526758429289，-6.58557573679886，-4.39207076049089, 2.36752476749952, 0.594715760553033, -0.29794568443312,-4.5365387390683, 0.196832250811775, 
-2.70852853745588, 0.498995124872827,0.165171574219401, 0.269498974991661, 0.901948386281446,-2.45955661653299、1.63525170542944、0.155897732673534、1.8491735212703、-0.856727109535223，-1.16182571974245，1.07658425742917，-2.21433585407388、4.3385479368043、4.40588599635354、0.127710423625772、-6.26956613362656，-1.17658595005389，-7.25886366924741，-0.888293709383838，-2.14177059335841，-2.42141595261389，-2.958120275175，-5.1274001953303，-5.32347488769128，-4.41290818553442，-1.21404719262173, -4.23649270310915)), 类 = c(tbl_df",tbl"，data.frame")，row.names = c(NA，-75L))，结构(列表(ID =结构(c(1L，1L，1L，2L，2L，2L，3L，3L，3L，4L，4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L,9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 13L,13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L,17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L,21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L,25L, 25L), .Label = c(01", 04", 06", 07", 08", 09",10"、11"、12"、13"、15"、16"、17"、18"、19"、21"、22"、23"、25"、27"、28"、30"、44"、46"、49")，类别 = 因素")，GR =结构(c(1L，1L，1L，1L，1L，1L，1L，1L，1L，1L，1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), SES = structure(c(1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"),条件 = 结构(c(1L，2L，3L，1L，2L，3L，1L，2L，3L，1L，2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 
2L, 3L), .Label = c(NEG-CTR", NEG-NOC", NEU-NOC"), 类 = 因子"), 值 = c(-5.96429031525769, -5.10918437158799,-2.81732229625975，-1.43557366487622，-3.14872157912645，0.160393685024631、3.52155765271648、2.10437989449921、2.70693992810407、5.49897156207812、5.81171180245335、-1.37301251388987、-0.434363848460157、2.87987510596148, -1.27152670283348, 17.2093269365993, 7.79412746755931,8.11964589961276、4.95253363860044、9.50695673265293、4.15235381401148、6.1294488368639、8.01447499455337、0.783414018677801、-1.24197194087055、-0.487178595894761，-9.79031812534203，4.22150266269492，4.20139847550095, 0.208005397351335, 4.19096721581768, 0.815283302847055,1.48137456347872、2.0809543999959、4.35199943309111、2.84860039832237、3.05879540677983、2.11976068962167、-0.269002712326028、-2.77155065610474、-2.59002218694999, 0.17928456999128, 2.24515223348079, 1.88805943988563,-0.0920286086411814, -2.00968595029144, 2.59427260100332,-1.27622011197768，0.588399071755827，-1.43982473126936，1.96978732491278，-0.338674980283045，-1.86484698930706，-0.0154791822607025、2.55036185373462、4.42520405730058、-0.599156247027551, 1.60091251589958, 4.7367320574401, -0.192490723623988,4.8452288234686、5.71745745981867、1.02554478706585、-4.5951256708181、1.1704842909792，-7.42770276334892，3.15655538248828，-0.639830772856786，-0.345116641695513，-0.0391030568720636，-2.61585906518491，-2.71685194532693，-1.7348388034111，1.00287124847525，-2.4844653851482)), class = c(tbl_df", tbl", data.frame"), row.names = c(NA,-75L))，结构(列表(ID = 结构(c(1L，1L，1L，2L，2L，2L，3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L,8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L,13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L,17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 21L,21L、22L、22L、22L、23L、23L、23L、24L、24L、24L、25L、25L、25L)、.Label = c(01"、04"、06"、07"、08"、09"、10"、11"、12"、13"、15"、16"、17"、18"、19"、21"、22"、23"、25"，27"，28"，30"，44"，46"，49")，类 =因子")，GR =结构(c(1L，1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"),SES = 结构(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"), COND = structure(c(1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c(NEG-CTR",NEG-NOC"，NEU-NOC")，类 = 因子")，值 = c(8.23981597718437，9.51261484648731, 9.42367409925817, 5.06332653216481, 5.02619159395405,9.07903916629231, 7.56089165217984, 5.49719893790597, 4.91476855238182,13.0320953572069, 10.8414516494484, 5.86927622259489, 3.25309970442897,4.6847880297099、2.71096740085175、25.567439566524、16.3241813617706、13.0990192799703, 11.9200281736866, 14.6901305277101, 9.67397418905514,10.2974302220899, 12.0768070828642, 5.9401530589224, 12.4817579327688,12.419526465857, 1.00612108990875, 9.63063375751153, 10.5631237176538,3.08031473770521、3.35694102903017、4.28046277054405、-0.133592200169464、6.9103658689166, 7.64737651416791, 6.75669517393108, 8.5369185279747,7.08645126073423、4.47409706618326、4.39617687043259、3.27924738047746、6.06169418872804、5.34939694712468、5.58288092654703、4.85729686493463、7.38032829587839, 11.7259526759912, 4.95764559864061, 
6.24066579989613,3.49843659402445、4.07498375647916、3.55732294589389、1.33918111568512、0.956782967443242、2.32002496709926、3.15289777246607、-0.832211906889126、6.39254974438057、7.0533787627062、2.97245026797807、6.23573445580928、7.6052386193207, 2.98791225155534, 3.10850022259445, 8.12060882554471,-0.00459651443883508, 13.5899217198075, 9.93070913311253,8.10285456644801、5.04464304009428、2.02262615478956、1.0510618938653、5.62233873107127, 10.1193593084848, 5.87476640145049)), 类 = c(tbl_df",tbl"，data.frame")，row.names = c(NA，-75L))))，class = c(grouped_df"，tbl_df"，tbl"，data.frame")，row.names = c(NA，-6L)，groups =结构(列表(信号 = c(LPPearlyCz"、LPPearlyFCz"、LPPearlyPz"、P3Cz"、P3FCz"，P3Pz")，.rows =结构(列表(5L，4L，6L，2L，1L, 3L), ptype = integer(0), class = c("vctrs_list_of",vctrs_vctr"、list")))、class = c(tbl_df"、tbl"、data.frame"), row.names = c(NA, -6L), .drop = TRUE))>

我尝试如下检查是否存在异常值:

# Flatten the nested tibble, keep the condition, signal name and measurement,
# then flag outliers in `value` separately within each COND group.
outliers_table <- df_join %>%
  unnest(cols = data) %>%
  dplyr::select(COND, signals, value) %>%
  group_by(COND) %>% # equivalent for time as grouping variable
  identify_outliers(value)

返回结果如下:

# A tibble: 30 x 5
   COND    signals     value is.outlier is.extreme
   <fct>   <chr>       <dbl> <lgl>      <lgl>
 1 NEG-CTR P3FCz       -11.6 TRUE       FALSE
 2 NEG-CTR P3Cz         13.7 TRUE       FALSE
 3 NEG-CTR P3Pz         15.1 TRUE       FALSE
 4 NEG-CTR P3Pz         13.7 TRUE       FALSE
 5 NEG-CTR P3Pz         23.4 TRUE       TRUE
 6 NEG-CTR P3Pz         15.8 TRUE       FALSE
 7 NEG-CTR P3Pz         14.4 TRUE       FALSE
 8 NEG-CTR LPPearlyFCz -11.8 TRUE       FALSE
 9 NEG-CTR LPPearlyCz   17.2 TRUE       FALSE
10 NEG-CTR LPPearlyPz   25.6 TRUE       TRUE

如果我有兴趣删除所有那些真正极端的值，我怎么能通过使用一些迭代函数或一些 if 语句来做呢?请考虑其他替代方案，以防更容易(也通过添加另一个 %>% 命令行来保持我编写的命令)编写一个 for 循环或其他一些函数.

从一开始我就编写了我创建的失败代码:

# The asker's attempt, restored to valid R. The logic is kept as posted because
# it is the code the question is asking about.
outliers_bale <- df_join %>%
  unnest(cols = data) %>%
  dplyr::select(COND, signals, value) %>%
  group_by(COND) %>% # equivalent for time as grouping variable
  identify_outliers(value) %>%
  filter(is.outlier & is.extreme)

# NOTE(review): the filtered result above is stored in `outliers_bale`, but this
# line reads `outliers_table`; the two names look like they were meant to be
# the same object -- confirm.
values <- outliers_table$value

# NOTE(review): `df_join$data` is a list-column of tibbles, so matching its
# elements against numeric values presumably never succeeds and no rows are
# dropped -- verify.
df_join[!(df_join$data %in% values), ]

我无法弄清楚它是否有效.

提前致谢

解决方案

好的.让我们一起一步一步来.据我了解，您非常担心在您的数据中(我将其保存在变量 df 中)存在异常值甚至极端值.首先，我们将从您的数据中仅提取一个分组的 tibble 并过滤 COND =="NEG-NOC"

library(tidyverse)
library(rstatix)
library(outliers)

# Take the first nested tibble and keep only the NEG-NOC condition.
data <- df$data[[1]] %>%
  filter(COND == "NEG-NOC")

现在让我们考虑我们将使用哪种异常值识别方法.我们可以使用 boxplot 函数.

# Values that boxplot.stats() reports in its `out` component.
boxplot.stats(data$value)$out
#[1] 8.164181

这很好，但它只会给我们向量形式的异常值.第二种方法是使用identify_outliers.这给了我们一个tibble，但仍然只有那些具有这些异常值的行.

# Returns only the rows flagged as outliers, with two added logical columns.
data %>% identify_outliers(variable = "value")
# # A tibble: 1 x 7
#   ID    GR    SES   COND    value is.outlier is.extreme
#   <fct> <fct> <fct> <fct>   <dbl> <lgl>      <lgl>
# 1 11    RP    V     NEG-NOC  8.16 TRUE       FALSE

好吧，让我们使用 outliers 包中的 outlier 函数.这可以给我们一个逻辑向量.

# outliers::outlier() returning the flagged value itself.
outlier(data$value, opposite = TRUE)
#[1] 8.164181

# Same call with logical = TRUE: one TRUE/FALSE flag per observation.
outlier(data$value, opposite = TRUE, logical = TRUE)
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#[22] FALSE FALSE FALSE FALSE

但是，这两种方法都不能帮助您决定如何处理这些异常值.请仔细阅读

As you can see on the chart, there are no outliers anymore!

Now we're ready to plot each signal!

# Build two special boxplots for one signal group and attach them to the
# group's data frame as list-columns.
#
# df        : data frame for a single group; `df$data[[1]]` is handed to
#             SpecBoxplot3().
# signal    : forwarded unchanged as the second argument of SpecBoxplot3()
#             (the driver below supplies the group key `.y`).
# printPlot : when TRUE (the default), both plots are printed as a side effect.
#
# Returns `df` with two extra list-columns, `boxplot1` and `boxplot2`.
# NOTE(review): SpecBoxplot3() is defined elsewhere; the meaning of its third
# (logical) argument cannot be determined from this block.
AddSignalBoxplot3 <- function(df, signal, printPlot=TRUE) {
  plot1 <- SpecBoxplot3(df$data[[1]], signal, TRUE)
  plot2 <- SpecBoxplot3(df$data[[1]], signal, FALSE)

  if (printPlot) {
    print(plot1)
    print(plot2)
  }

  # Wrap each plot in list() so it is stored as a single list-column cell.
  df %>%
    mutate(
      boxplot1 = list(plot1),
      boxplot2 = list(plot2)
    )
}

# Add the special boxplots to every signal group.
df %>%
  group_by(signals) %>%
  group_modify(~AddSignalBoxplot3(.x, .y))

Good luck on your further analysis !!

Last update

# For one signal group, draw a density-scaled histogram of `value` per COND
# with a kernel density estimate (red) and a normal curve (blue) overlaid.
#
# df    : data frame for a single group; the measurements are read from
#         `df$data[[1]]`, which must provide `value` and `COND` columns.
# group : passed to xlab() as the x-axis label (the driver below supplies the
#         group key from group_map()).
#
# Returns the ggplot object, faceted by COND.
create.plot2 <- function(df, group) {
  data <- df$data[[1]]

  # The x-grid for the normal curves spans the overall range of `value`,
  # so every COND facet is evaluated over the same interval.
  minv <- min(data$value)
  maxv <- max(data$value)

  # Per COND: evaluate dnorm() with that group's mean and sd on a grid of
  # n * 100 points. Later arguments deliberately refer to the columns created
  # by earlier ones (`n`, `mean`, `sd`, `min`, `max`, `x`).
  # NOTE(review): this summarise() returns many rows per group; recent dplyr
  # releases recommend reframe() for that -- confirm the installed version
  # before switching.
  df.stat <- data %>%
    group_by(COND) %>%
    summarise(
      n = n(),
      mean = mean(value),
      sd = sd(value),
      min = minv,
      max = maxv,
      x = seq(min, max, length.out = n * 100),
      value = dnorm(x, mean, sd)
    )

  # NOTE(review): `..density..` is the older ggplot2 notation; newer releases
  # prefer after_stat(density) -- confirm the installed version before changing.
  data %>%
    ggplot(aes(value, fill = COND)) +
    geom_histogram(
      aes(y = ..density..),
      colour = "black", fill = "white", bins = 30
    ) +
    geom_density(alpha = .2, fill = "red", col = "red") +
    geom_line(aes(x, value), data = df.stat, col = "blue") +
    xlab(group) +
    facet_grid(cols = vars(COND))
}

# One plot per signal group.
df %>%
  group_by(signals) %>%
  group_map(create.plot2)

I'm trying to remove outliers from this nested dataset

df_join
# A tibble: 12 x 2
# Groups:   signals [12]
   signals     data             
   <chr>       <list>           
 1 P3FCz       <tibble [75 x 5]>
 2 P3Cz        <tibble [75 x 5]>
 3 P3Pz        <tibble [75 x 5]>
 4 LPPearlyFCz <tibble [75 x 5]>
 5 LPPearlyCz  <tibble [75 x 5]>
 6 LPPearlyPz  <tibble [75 x 5]>
 7 LPP1FCz     <tibble [75 x 5]>
 8 LPP1Cz      <tibble [75 x 5]>
 9 LPP1Pz      <tibble [75 x 5]>
10 LPP2FCz     <tibble [75 x 5]>
11 LPP2Cz      <tibble [75 x 5]>
12 LPP2Pz      <tibble [75 x 5]>

For instance, its first element contains this series of variables:

df_join[[2]][[1]]
# A tibble: 75 x 5
   ID    GR    SES   COND      value
   <fct> <fct> <fct> <fct>     <dbl>
 1 01    RP    V     NEG-CTR -11.6  
 2 01    RP    V     NEG-NOC -11.1  
 3 01    RP    V     NEU-NOC  -4.00 
 4 04    RP    V     NEG-CTR  -0.314
 5 04    RP    V     NEG-NOC   0.239
 6 04    RP    V     NEU-NOC   5.04 
 7 06    RP    V     NEG-CTR  -0.214
 8 06    RP    V     NEG-NOC  -2.96 
 9 06    RP    V     NEU-NOC  -1.97 
10 07    RP    V     NEG-CTR  -2.83

Its entire content is the following:

> dput(head(df_join))
structure(list(signals = c("P3FCz", "P3Cz", "P3Pz", "LPPearlyFCz", 
"LPPearlyCz", "LPPearlyPz"), data = list(structure(list(ID = structure(c(1L, 
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 
6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 
11L, 12L, 12L, 12L, 13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 
16L, 16L, 16L, 17L, 17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 
20L, 20L, 21L, 21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 
24L, 25L, 25L, 25L), .Label = c("01", "04", "06", "07", "08", 
"09", "10", "11", "12", "13", "15", "16", "17", "18", "19", "21", 
"22", "23", "25", "27", "28", "30", "44", "46", "49"), class = "factor"), 
    GR = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), SES = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"), 
    COND = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L), .Label = c("NEG-CTR", "NEG-NOC", "NEU-NOC"
    ), class = "factor"), value = c(-11.6312151716924, -11.1438413285935, 
    -3.99591470944713, -0.314155675382471, 0.238885648959708, 
    5.03749946898385, -0.213621915029167, -2.96032491743069, 
    -1.97168681693488, -2.83109425298642, 1.09291198163802, -6.692991645215, 
    4.23849942428043, 2.9898889629932, 3.5510699900835, 9.57481668808606, 
    5.4167795618285, 1.7067607715475, -6.13036076093477, -2.82955734597919, 
    -2.50672211111696, 0.528517585832501, 8.16418133488309, 1.88777321897925, 
    -7.73588468896919, -9.83058052401056, -6.97442700196932, 
    1.27327945355082, 2.11962397764132, 0.524299677616254, -1.83310726842883, 
    0.658810483381172, -0.261373488428192, 4.37524298634374, 
    0.625555654900511, 3.19617639836154, 0.0405517582137798, 
    -3.29357103412113, -0.381435057304614, -5.73445509910268, 
    -6.1129152355645, -2.45744234877604, 2.95352732001065, 0.527721249096473, 
    1.91803490989119, -3.46703346467546, -2.40438419043702, -5.35374408162217, 
    -7.27028665849262, -7.1532211375959, -5.39955520296854, 2.65765002364624, 
    0.372495441513391, 6.24433066412776, 1.85698518142405, -0.564454675803529, 
    -0.068523080368053, -7.04782633579147, -4.52263283590558, 
    -6.62134671432544, 4.56661945182626, 3.05859761335498, 2.02997952225347, 
    -6.10523962206958, -0.521871236969702, -3.97851995684846, 
    -2.61258020387919, -4.13974828699279, -3.9210032516844, -4.63162466544638, 
    -4.36762718685405, -6.71005969834916, -4.22719611676328, 
    -0.229916506217565, -5.69725200870146)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -75L)), structure(list(
    ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 
    4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 
    9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 13L, 
    13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L, 
    17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 
    21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 
    25L, 25L), .Label = c("01", "04", "06", "07", "08", "09", 
    "10", "11", "12", "13", "15", "16", "17", "18", "19", "21", 
    "22", "23", "25", "27", "28", "30", "44", "46", "49"), class = "factor"), 
    GR = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), SES = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"), 
    COND = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L), .Label = c("NEG-CTR", "NEG-NOC", "NEU-NOC"
    ), class = "factor"), value = c(-5.16524399006139, -5.53112490175437, 
    0.621502123415388, 2.23100741241039, 3.96990710862955, 7.75899775608441, 
    -1.30019374375434, -3.59899040898949, -1.92340529575071, 
    2.19344184533265, 5.87900720863083, -5.92378937757888, 2.44958531767688, 
    3.10043497883256, 1.65779442628225, 13.7118233181713, 6.86178446511352, 
    5.31481098188172, -4.13240668697805, 0.162182285588285, 0.142083484505352, 
    5.42592103255673, 14.5496375672716, 4.52018125654081, -2.40677805475299, 
    -5.3832670295207, -1.55736964635117, 3.48359241788107, 4.23167123533126, 
    2.00051785325202, 1.48755216347718, 2.37269462739372, 1.30346907198835, 
    3.89476490634811, 1.87516303240986, 4.36353100770575, 1.9413417416824, 
    -2.22114447555529, -0.015852062711641, -2.76146409940467, 
    -3.51627712447581, 1.01799377568815, 1.74783962328435, 1.1303870721987, 
    2.16398550183836, -3.31557794753334, -1.83920975041768, -6.06703163736936, 
    -8.1566939611461, -9.23030396302541, -4.35545141573936, 0.906302081219897, 
    0.45401759063429, 3.80236232314171, 4.0336657306528, 2.0185967445137, 
    0.835589319243251, -4.6805488231028, -1.20746167339041, -5.50475999427345, 
    4.96594373869991, 4.1349308440931, 3.00187233307059, -5.61465293602653, 
    0.544596077279702, -5.20450410570445, -0.0325220589039272, 
    -2.28038421035601, -2.01375702882255, -1.6547144697087, -0.619979893871085, 
    -4.48258340054462, -1.42281778522059, 2.62315679073783, -4.13736508533355
    )), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-75L)), structure(list(ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 
3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 
8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 
13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L, 
17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 21L, 
21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 25L, 25L
), .Label = c("01", "04", "06", "07", "08", "09", "10", "11", 
"12", "13", "15", "16", "17", "18", "19", "21", "22", "23", "25", 
"27", "28", "30", "44", "46", "49"), class = "factor"), GR = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), 
    SES = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"), COND = structure(c(1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("NEG-CTR", 
    "NEG-NOC", "NEU-NOC"), class = "factor"), value = c(11.8802266972569, 
    12.1053426662461, 12.955441582096, 15.0981004360619, 15.4046229884164, 
    16.671036999147, 3.13771453335467, -0.0892565159000666, 2.15365554736525, 
    13.6778924406572, 14.3862738306396, 6.86762877785576, 7.47946451329025, 
    8.93405130318593, 8.45962311067909, 23.4166601996042, 15.1868092142896, 
    9.97183712753913, 6.267521071803, 10.142198458411, 10.6320358418368, 
    12.9998037913548, 20.7052065690674, 11.8852179570666, 15.7899796085713, 
    7.50729833890206, 14.3076172484818, 9.93797956768228, 10.7693238464384, 
    5.04681800218272, 5.16656503460515, 7.87875085817396, 2.29899409536951, 
    10.0135486953849, 5.48278706243332, 7.81908431468528, 8.64382513728869, 
    3.35777109534179, 3.47474629234488, 4.35678644331281, 3.47085321062162, 
    6.56231512354717, 4.93825547529124, 7.33985613752315, 6.81966900599588, 
    6.54487921689425, 7.25872117706077, 1.10301223694429, -0.856423579793706, 
    -0.887835692028378, -0.931653372049331, 5.6617683754256, 
    2.29939831067085, 5.1554825066748, 6.59026080217083, 3.0741733363644, 
    1.80359068950898, 1.63892755704177, 3.857933716935, 0.769316188513939, 
    10.7031907391191, 9.53278894637555, 8.01071628743378, 6.04891324234645, 
    11.1964453850602, 3.46633322373091, 14.4393884282958, 11.2339563353478, 
    7.74933708914689, 7.1182095475238, 7.39260082121406, 0.627435381320771, 
    9.15473202689768, 13.6559037433263, 7.14786907480758)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -75L)), structure(list(
    ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 
    4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 
    9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 13L, 
    13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L, 
    17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 
    21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 
    25L, 25L), .Label = c("01", "04", "06", "07", "08", "09", 
    "10", "11", "12", "13", "15", "16", "17", "18", "19", "21", 
    "22", "23", "25", "27", "28", "30", "44", "46", "49"), class = "factor"), 
    GR = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), SES = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"), 
    COND = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L), .Label = c("NEG-CTR", "NEG-NOC", "NEU-NOC"
    ), class = "factor"), value = c(-11.7785042972793, -9.14927207125904, 
    -7.58190508537766, -4.01515836011381, -6.60165385653499, 
    -2.02861964460179, 4.46729570509601, 2.54036572774646, 2.22923889930115, 
    -0.883620011106743, -2.63569087592267, -2.0629672230873, 
    1.14544537612393, 2.08056674659401, 0.0422658298956365, 13.2986259796748, 
    5.06669915366333, 3.93467692474742, 0.0229069420708053, 4.31923128857779, 
    0.237726051904304, 1.89972383690448, 3.2371880079134, 0.318100791495115, 
    -8.08292381883298, -5.73174008540523, -15.7998485301436, 
    1.75469999857951, 0.677370118816266, -1.8397955509895, 2.55445787016256, 
    -0.380810453692585, 0.62462329496673, 2.61316333850434, 2.68202480583985, 
    1.76690658846479, 0.148635887703097, -0.958853757041888, 
    -3.17305964093897, -7.82526758429289, -6.58557573679886, 
    -4.39207076049089, 2.36752476749952, 0.594715760553033, -0.29794568443312, 
    -4.5365387390683, 0.196832250811775, -2.70852853745588, 0.498995124872827, 
    0.165171574219401, 0.269498974991661, 0.901948386281446, 
    -2.45955661653299, 1.63525170542944, 0.155897732673534, 1.8491735212703, 
    -0.856727109535223, -1.16182571974245, 1.07658425742917, 
    -2.21433585407388, 4.3385479368043, 4.40588599635354, 0.127710423625772, 
    -6.26956613362656, -1.17658595005389, -7.25886366924741, 
    -0.888293709383838, -2.14177059335841, -2.42141595261389, 
    -2.958120275175, -5.1274001953303, -5.32347488769128, -4.41290818553442, 
    -1.21404719262173, -4.23649270310915)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -75L)), structure(list(
    ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 
    4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 
    9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 13L, 
    13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L, 
    17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 
    21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 
    25L, 25L), .Label = c("01", "04", "06", "07", "08", "09", 
    "10", "11", "12", "13", "15", "16", "17", "18", "19", "21", 
    "22", "23", "25", "27", "28", "30", "44", "46", "49"), class = "factor"), 
    GR = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), SES = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"), 
    COND = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L), .Label = c("NEG-CTR", "NEG-NOC", "NEU-NOC"
    ), class = "factor"), value = c(-5.96429031525769, -5.10918437158799, 
    -2.81732229625975, -1.43557366487622, -3.14872157912645, 
    0.160393685024631, 3.52155765271648, 2.10437989449921, 2.70693992810407, 
    5.49897156207812, 5.81171180245335, -1.37301251388987, -0.434363848460157, 
    2.87987510596148, -1.27152670283348, 17.2093269365993, 7.79412746755931, 
    8.11964589961276, 4.95253363860044, 9.50695673265293, 4.15235381401148, 
    6.1294488368639, 8.01447499455337, 0.783414018677801, -1.24197194087055, 
    -0.487178595894761, -9.79031812534203, 4.22150266269492, 
    4.20139847550095, 0.208005397351335, 4.19096721581768, 0.815283302847055, 
    1.48137456347872, 2.0809543999959, 4.35199943309111, 2.84860039832237, 
    3.05879540677983, 2.11976068962167, -0.269002712326028, -2.77155065610474, 
    -2.59002218694999, 0.17928456999128, 2.24515223348079, 1.88805943988563, 
    -0.0920286086411814, -2.00968595029144, 2.59427260100332, 
    -1.27622011197768, 0.588399071755827, -1.43982473126936, 
    1.96978732491278, -0.338674980283045, -1.86484698930706, 
    -0.0154791822607025, 2.55036185373462, 4.42520405730058, 
    -0.599156247027551, 1.60091251589958, 4.7367320574401, -0.192490723623988, 
    4.8452288234686, 5.71745745981867, 1.02554478706585, -4.5951256708181, 
    1.1704842909792, -7.42770276334892, 3.15655538248828, -0.639830772856786, 
    -0.345116641695513, -0.0391030568720636, -2.61585906518491, 
    -2.71685194532693, -1.7348388034111, 1.00287124847525, -2.4844653851482
    )), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-75L)), structure(list(ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 
3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 
8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 
13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L, 
17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 21L, 
21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 25L, 25L
), .Label = c("01", "04", "06", "07", "08", "09", "10", "11", 
"12", "13", "15", "16", "17", "18", "19", "21", "22", "23", "25", 
"27", "28", "30", "44", "46", "49"), class = "factor"), GR = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), 
    SES = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"), COND = structure(c(1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("NEG-CTR", 
    "NEG-NOC", "NEU-NOC"), class = "factor"), value = c(8.23981597718437, 
    9.51261484648731, 9.42367409925817, 5.06332653216481, 5.02619159395405, 
    9.07903916629231, 7.56089165217984, 5.49719893790597, 4.91476855238182, 
    13.0320953572069, 10.8414516494484, 5.86927622259489, 3.25309970442897, 
    4.6847880297099, 2.71096740085175, 25.567439566524, 16.3241813617706, 
    13.0990192799703, 11.9200281736866, 14.6901305277101, 9.67397418905514, 
    10.2974302220899, 12.0768070828642, 5.9401530589224, 12.4817579327688, 
    12.419526465857, 1.00612108990875, 9.63063375751153, 10.5631237176538, 
    3.08031473770521, 3.35694102903017, 4.28046277054405, -0.133592200169464, 
    6.9103658689166, 7.64737651416791, 6.75669517393108, 8.5369185279747, 
    7.08645126073423, 4.47409706618326, 4.39617687043259, 3.27924738047746, 
    6.06169418872804, 5.34939694712468, 5.58288092654703, 4.85729686493463, 
    7.38032829587839, 11.7259526759912, 4.95764559864061, 6.24066579989613, 
    3.49843659402445, 4.07498375647916, 3.55732294589389, 1.33918111568512, 
    0.956782967443242, 2.32002496709926, 3.15289777246607, -0.832211906889126, 
    6.39254974438057, 7.0533787627062, 2.97245026797807, 6.23573445580928, 
    7.6052386193207, 2.98791225155534, 3.10850022259445, 8.12060882554471, 
    -0.00459651443883508, 13.5899217198075, 9.93070913311253, 
    8.10285456644801, 5.04464304009428, 2.02262615478956, 1.0510618938653, 
    5.62233873107127, 10.1193593084848, 5.87476640145049)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -75L)))), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), groups = structure(list(
    signals = c("LPPearlyCz", "LPPearlyFCz", "LPPearlyPz", "P3Cz", 
    "P3FCz", "P3Pz"), .rows = structure(list(5L, 4L, 6L, 2L, 
        1L, 3L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L), .drop = TRUE))
>

I've tried to check for the presence of outliers as follows:

outliers_table <- df_join %>%
  unnest() %>% 
  dplyr::select(COND, signals, value) %>% 
  group_by(COND) %>%  #it is the equivalent to use as grouping variable the time
  identify_outliers(value)

That returns

A tibble: 30 x 5
   COND    signals     value is.outlier is.extreme
   <fct>   <chr>       <dbl> <lgl>      <lgl>     
 1 NEG-CTR P3FCz       -11.6 TRUE       FALSE     
 2 NEG-CTR P3Cz         13.7 TRUE       FALSE     
 3 NEG-CTR P3Pz         15.1 TRUE       FALSE     
 4 NEG-CTR P3Pz         13.7 TRUE       FALSE     
 5 NEG-CTR P3Pz         23.4 TRUE       TRUE      
 6 NEG-CTR P3Pz         15.8 TRUE       FALSE     
 7 NEG-CTR P3Pz         14.4 TRUE       FALSE     
 8 NEG-CTR LPPearlyFCz -11.8 TRUE       FALSE     
 9 NEG-CTR LPPearlyCz   17.2 TRUE       FALSE     
10 NEG-CTR LPPearlyPz   25.6 TRUE       TRUE

If I'm interested in deleting all of those values that are TRULY EXTREME, how could I do it by using some iterative function or some if statement? Please also consider other alternatives in case they are easier than writing a for loop or some other function (for example, extending the command I've written by adding another %>% row).

Since I'm at the very beginning, here is the failing code I've written so far:

outliers_bale <- df_join %>%
  unnest() %>% 
  dplyr::select(COND, signals, value) %>% 
  group_by(COND) %>%  #it is the equivalent to use as grouping variable the time
  identify_outliers(value) %>% 
  filter(is.outlier & is.extreme)

values <- outliers_table$value

df_join[!(df_join$data %in% values), ]

And I am not able to figure out whether it worked or not.

Thanks in advance

解决方案

All right. Let's do it together step by step. As I understand it, you have serious concerns that in your data (I keep it in the variable df) there are outliers and even extreme values. First, we will extract only one of the nested tibbles from your data and filter it for COND == "NEG-NOC".

library(tidyverse)
library(rstatix)
library(outliers)

data = df$data[[1]] %>% filter(COND=="NEG-NOC")

Now let's consider what method of outlier identification we will use. We can use the boxplot function for this.

boxplot.stats(data$value)$out
#[1] 8.164181

This is fine, but it only gives us outliers in vector form. The second way is to use identify_outliers. This gives us a tibble but still only with those lines that have these outlier values.

data %>% identify_outliers(variable = "value")
# # A tibble: 1 x 7
# ID    GR    SES   COND    value is.outlier is.extreme
# <fct> <fct> <fct> <fct>   <dbl> <lgl>      <lgl>     
#   1 11    RP    V     NEG-NOC  8.16 TRUE       FALSE

Well, let's use the outlier function from the outliers package. This can give us a logic vector.

outlier(data$value, opposite = T)
#[1] 8.164181
outlier(data$value, opposite = T, logical = T)
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#[22] FALSE FALSE FALSE FALSE

However, neither of these methods will assist you in deciding what to do with these outliers. Please read this carefully . As you can see, you have three options to choose from: Imputation, Capping, Prediction. Which one will you choose? I chose Capping. So I wrote a tiny function that identifies outliers, extreme values and additionally returns your values after Capping.

# Flag outliers and extreme values in `data$value` and add a capped copy.
#
# Fences are built from the quartiles of `value`:
#   outlier: below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR
#   extreme: below Q1 - 3 * IQR   or above Q3 + 3 * IQR
# Outliers are replaced in `cap.value` by the 5% quantile (low side) or the
# 95% quantile (high side); `value` itself is left untouched.
#
# data  data frame / tibble with a numeric `value` column and a `COND` column.
#
# Returns `data` grouped by COND with three added columns:
# `is.outlier`, `is.extreme` and `cap.value`.
#
# NOTE(review): the flags are computed over all rows before grouping, so this
# appears to expect `data` to hold a single COND level - confirm with callers.
fOutCapp <- function(data){
  x <- data$value
  qnt <- quantile(x, probs = c(.25, .75), na.rm = TRUE)
  caps <- quantile(x, probs = c(.05, .95), na.rm = TRUE)
  iqr <- IQR(x, na.rm = TRUE)
  H <- 1.5 * iqr
  He <- 3 * iqr

  # Both flags must be derived from the RAW values. Testing for extremes after
  # capping would compare the replacement value, not the observation itself.
  below <- x < (qnt[1] - H)
  above <- x > (qnt[2] + H)
  flag_outlier <- below | above
  flag_extreme <- (x < (qnt[1] - He)) | (x > (qnt[2] + He))

  # which() skips NA comparisons, so missing values stay missing.
  capped <- x
  capped[which(below)] <- caps[1]
  capped[which(above)] <- caps[2]

  data %>% group_by(COND) %>%
    mutate(
      is.outlier = flag_outlier,
      is.extreme = flag_extreme,
      cap.value = capped
    )
}

Let's see if it works

data %>% fOutCapp() %>% filter(is.outlier)
# A tibble: 1 x 8
# ID    GR    SES   COND    value is.outlier is.extreme cap.value
# <fct> <fct> <fct> <fct>   <dbl> <lgl>      <lgl>          <dbl>
#   1 11    RP    V     NEG-NOC  8.16 TRUE       FALSE           4.95
data %>% fOutCapp()
# A tibble: 25 x 8
# ID    GR    SES   COND      value is.outlier is.extreme cap.value
# <fct> <fct> <fct> <fct>     <dbl> <lgl>      <lgl>          <dbl>
#   1 01    RP    V     NEG-NOC -11.1   FALSE      FALSE        -11.1  
# 2 04    RP    V     NEG-NOC   0.239 FALSE      FALSE          0.239
# 3 06    RP    V     NEG-NOC  -2.96  FALSE      FALSE         -2.96 
# 4 07    RP    V     NEG-NOC   1.09  FALSE      FALSE          1.09 
# 5 08    RP    V     NEG-NOC   2.99  FALSE      FALSE          2.99 
# 6 09    RP    V     NEG-NOC   5.42  FALSE      FALSE          5.42 
# 7 10    RP    V     NEG-NOC  -2.83  FALSE      FALSE         -2.83 
# 8 11    RP    V     NEG-NOC   8.16  TRUE       FALSE          4.95 
# 9 12    RP    V     NEG-NOC  -9.83  FALSE      FALSE         -9.83 
# 10 13    RP    V     NEG-NOC   2.12  FALSE      FALSE          2.12 
# ... with 15 more rows

Note, however, that your data inside the variable data is grouped by the variable COND. So let's write one more tiny function that will apply our fOutCapp to each of the groups.

# Run fOutCapp() separately on the rows of each COND level.
#
# data  data frame / tibble containing a COND column plus whatever
#       fOutCapp() reads.
#
# Returns the per-group results recombined by group_modify(), still grouped
# by COND.
fOutCappGroup <- function(data) {
  by_condition <- group_by(data, COND)
  group_modify(by_condition, function(.x, .y) fOutCapp(.x))
}

df$data[[1]] %>% fOutCappGroup()
# # A tibble: 75 x 8
# # Groups:   COND [3]
# COND    ID    GR    SES     value is.outlier is.extreme cap.value
# <fct>   <fct> <fct> <fct>   <dbl> <lgl>      <lgl>          <dbl>
#   1 NEG-CTR 01    RP    V     -11.6   FALSE      FALSE        -11.6  
# 2 NEG-CTR 04    RP    V      -0.314 FALSE      FALSE         -0.314
# 3 NEG-CTR 06    RP    V      -0.214 FALSE      FALSE         -0.214
# 4 NEG-CTR 07    RP    V      -2.83  FALSE      FALSE         -2.83 
# 5 NEG-CTR 08    RP    V       4.24  FALSE      FALSE          4.24 
# 6 NEG-CTR 09    RP    V       9.57  FALSE      FALSE          9.57 
# 7 NEG-CTR 10    RP    V      -6.13  FALSE      FALSE         -6.13 
# 8 NEG-CTR 11    RP    V       0.529 FALSE      FALSE          0.529
# 9 NEG-CTR 12    RP    V      -7.74  FALSE      FALSE         -7.74 
# 10 NEG-CTR 13    RP    V       1.27  FALSE      FALSE          1.27 
# # ... with 65 more rows

Bingo. Everything works great. Now we only need to do one simple mutation.

# Annotate every signal's nested table via fOutCappGroup(), then flatten the
# list column so there is one row per observation.
df %>% group_by(signals) %>% 
  mutate(data = map(data, ~fOutCappGroup(.x))) %>% 
  unnest(data)

output

# A tibble: 450 x 9
# Groups:   signals [6]
   signals COND    ID    GR    SES     value is.outlier is.extreme cap.value
   <chr>   <fct>   <fct> <fct> <fct>   <dbl> <lgl>      <lgl>          <dbl>
 1 P3FCz   NEG-CTR 01    RP    V     -11.6   FALSE      FALSE        -11.6  
 2 P3FCz   NEG-CTR 04    RP    V      -0.314 FALSE      FALSE         -0.314
 3 P3FCz   NEG-CTR 06    RP    V      -0.214 FALSE      FALSE         -0.214
 4 P3FCz   NEG-CTR 07    RP    V      -2.83  FALSE      FALSE         -2.83 
 5 P3FCz   NEG-CTR 08    RP    V       4.24  FALSE      FALSE          4.24 
 6 P3FCz   NEG-CTR 09    RP    V       9.57  FALSE      FALSE          9.57 
 7 P3FCz   NEG-CTR 10    RP    V      -6.13  FALSE      FALSE         -6.13 
 8 P3FCz   NEG-CTR 11    RP    V       0.529 FALSE      FALSE          0.529
 9 P3FCz   NEG-CTR 12    RP    V      -7.74  FALSE      FALSE         -7.74 
10 P3FCz   NEG-CTR 13    RP    V       1.27  FALSE      FALSE          1.27 
# ... with 440 more rows

This is how your task has been completed. Not only did we identify outliers, but we also applied capping to them. Now decide whether to use the value variable or the cap.value variable for further analysis. The decision is yours.

A small update for a @little_statistician

First, we will load all your data.

#Loading libraries
library(tidyverse)
library(rstatix)
library(ggpubr)
library(readxl)

#Upload data
df_join <- read_excel("df_join.xlsx")

df = df_join  %>%
  mutate_at(vars(ID:COND), factor) %>%
  pivot_longer(P3FCz:LPP2Pz, names_to = "signals") %>%
  group_by(signals) %>%
  nest()

Now let's define the fOutCapp and fOutCappGroup functions once again. Note, in the original version of fOutCapp there is no need for the group_by function.

# Flag outliers and extreme values in `data$value` and add a capped copy.
#
# Fences are built from the quartiles of `value`:
#   outlier: below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR
#   extreme: below Q1 - 3 * IQR   or above Q3 + 3 * IQR
# Outliers are replaced in `cap.value` by the 5% quantile (low side) or the
# 95% quantile (high side); `value` itself is left untouched.
#
# data  data frame / tibble with a numeric `value` column.
#
# Returns `data` with three added (or overwritten) columns:
# `is.outlier`, `is.extreme` and `cap.value`.
fOutCapp <- function(data){
  x <- data$value
  # names = FALSE keeps the "25%"-style labels out of the derived vectors.
  qnt <- quantile(x, probs = c(.25, .75), na.rm = TRUE, names = FALSE)
  caps <- quantile(x, probs = c(.05, .95), na.rm = TRUE, names = FALSE)
  iqr <- IQR(x, na.rm = TRUE)
  H <- 1.5 * iqr
  He <- 3 * iqr

  # Both flags must be derived from the RAW values. Testing for extremes after
  # capping would compare the replacement value, not the observation itself.
  below <- x < (qnt[1] - H)
  above <- x > (qnt[2] + H)
  flag_outlier <- below | above
  flag_extreme <- (x < (qnt[1] - He)) | (x > (qnt[2] + He))

  # which() skips NA comparisons, so missing values stay missing.
  capped <- x
  capped[which(below)] <- caps[1]
  capped[which(above)] <- caps[2]

  # Plain column assignment: unlike mutate(), it cannot accidentally read a
  # stale `is.outlier` / `x` column when `data` has been annotated before.
  data$is.outlier <- flag_outlier
  data$is.extreme <- flag_extreme
  data$cap.value <- capped
  data
}

# Run fOutCapp() separately on the rows of each COND level.
#
# data  data frame / tibble containing a COND column plus whatever
#       fOutCapp() reads.
#
# Returns the per-group results recombined by group_modify(), still grouped
# by COND.
fOutCappGroup <- function(data) {
  by_condition <- group_by(data, COND)
  group_modify(by_condition, function(.x, .y) fOutCapp(.x))
}

Now is the time to mutate.

# Cap the outliers of every signal and restore the nested layout afterwards.
# The step numbers match the five steps described in the surrounding text.
df = df %>% group_by(signals) %>% # Step 1: group by signal
  mutate(data = map(data, ~fOutCappGroup(.x))) %>% # Step 2: annotate each nested table via fOutCappGroup()
  unnest(data) %>% # Step 3: flatten to one row per observation
  mutate(old.value = value,
         value = cap.value) %>% # Step 4: keep the raw value in old.value, use the capped one as value
  nest(data=COND:old.value)  # Step 5: re-nest columns COND through old.value into `data`

It is very important that you understand what is really going on here. So in step 1 we group your tibble by the signals variable. It is simple and you certainly understand it. In step 2 we mutate the data variable, which is a list consisting of data for individual signals.

output after step 2

# A tibble: 12 x 2
# Groups:   signals [12]
   signals     data                 
   <chr>       <list>               
 1 P3FCz       <grouped_df [75 x 8]>
 2 P3Cz        <grouped_df [75 x 8]>
 3 P3Pz        <grouped_df [75 x 8]>
 4 LPPearlyFCz <grouped_df [75 x 8]>
 5 LPPearlyCz  <grouped_df [75 x 8]>
 6 LPPearlyPz  <grouped_df [75 x 8]>
 7 LPP1FCz     <grouped_df [75 x 8]>
 8 LPP1Cz      <grouped_df [75 x 8]>
 9 LPP1Pz      <grouped_df [75 x 8]>
10 LPP2FCz     <grouped_df [75 x 8]>
11 LPP2Cz      <grouped_df [75 x 8]>
12 LPP2Pz      <grouped_df [75 x 8]>

This way your inner tibbles have gained new variables. You will see it after the unnest in step 3.

output after step 3

# A tibble: 900 x 9
# Groups:   signals [12]
   signals COND    ID    GR    SES     value is.outlier is.extreme cap.value
   <chr>   <fct>   <fct> <fct> <fct>   <dbl> <lgl>      <lgl>          <dbl>
 1 P3FCz   NEG-CTR 01    RP    V     -11.6   FALSE      FALSE        -11.6  
 2 P3FCz   NEG-CTR 04    RP    V      -0.314 FALSE      FALSE         -0.314
 3 P3FCz   NEG-CTR 06    RP    V      -0.214 FALSE      FALSE         -0.214
 4 P3FCz   NEG-CTR 07    RP    V      -2.83  FALSE      FALSE         -2.83 
 5 P3FCz   NEG-CTR 08    RP    V       4.24  FALSE      FALSE          4.24 
 6 P3FCz   NEG-CTR 09    RP    V       9.57  FALSE      FALSE          9.57 
 7 P3FCz   NEG-CTR 10    RP    V      -6.13  FALSE      FALSE         -6.13 
 8 P3FCz   NEG-CTR 11    RP    V       0.529 FALSE      FALSE          0.529
 9 P3FCz   NEG-CTR 12    RP    V      -7.74  FALSE      FALSE         -7.74 
10 P3FCz   NEG-CTR 13    RP    V       1.27  FALSE      FALSE          1.27 
# ... with 890 more rows

And since you already have a very nice function that generates beautiful boxplot-violin plots with different stats, let's do one small mutation (step 4) replacing value with cap.value.

output after step 4

# A tibble: 900 x 10
# Groups:   signals [12]
   signals COND    ID    GR    SES     value is.outlier is.extreme cap.value old.value
   <chr>   <fct>   <fct> <fct> <fct>   <dbl> <lgl>      <lgl>          <dbl>     <dbl>
 1 P3FCz   NEG-CTR 01    RP    V     -11.6   FALSE      FALSE        -11.6     -11.6  
 2 P3FCz   NEG-CTR 04    RP    V      -0.314 FALSE      FALSE         -0.314    -0.314
 3 P3FCz   NEG-CTR 06    RP    V      -0.214 FALSE      FALSE         -0.214    -0.214
 4 P3FCz   NEG-CTR 07    RP    V      -2.83  FALSE      FALSE         -2.83     -2.83 
 5 P3FCz   NEG-CTR 08    RP    V       4.24  FALSE      FALSE          4.24      4.24 
 6 P3FCz   NEG-CTR 09    RP    V       9.57  FALSE      FALSE          9.57      9.57 
 7 P3FCz   NEG-CTR 10    RP    V      -6.13  FALSE      FALSE         -6.13     -6.13 
 8 P3FCz   NEG-CTR 11    RP    V       0.529 FALSE      FALSE          0.529     0.529
 9 P3FCz   NEG-CTR 12    RP    V      -7.74  FALSE      FALSE         -7.74     -7.74 
10 P3FCz   NEG-CTR 13    RP    V       1.27  FALSE      FALSE          1.27      1.27 
# ... with 890 more rows

Finally, let's roll it all back to its original form with the variable data in step 5.

output after step 5

# A tibble: 12 x 2
# Groups:   signals [12]
   signals     data             
   <chr>       <list>           
 1 P3FCz       <tibble [75 x 9]>
 2 P3Cz        <tibble [75 x 9]>
 3 P3Pz        <tibble [75 x 9]>
 4 LPPearlyFCz <tibble [75 x 9]>
 5 LPPearlyCz  <tibble [75 x 9]>
 6 LPPearlyPz  <tibble [75 x 9]>
 7 LPP1FCz     <tibble [75 x 9]>
 8 LPP1Cz      <tibble [75 x 9]>
 9 LPP1Pz      <tibble [75 x 9]>
10 LPP2FCz     <tibble [75 x 9]>
11 LPP2Cz      <tibble [75 x 9]>
12 LPP2Pz      <tibble [75 x 9]>

Well now let's make a graph!

# Draw a violin + box + jitter plot of `value` by `COND`, annotated with
# pairwise-comparison p-values and an overall test label.
#
# data        data frame with the columns `COND` and `value`
# signal      label used for the y axis
# parametric  TRUE: paired pairwise t-tests (p.adjust.method "bonferroni")
#             plus anova_test(); FALSE: pairwise Wilcoxon tests plus
#             kruskal_test()
# autor       text shown as the plot caption
#
# Returns the ggplot object.
SpecBoxplot3 <- function(data, signal, parametric = FALSE, autor = "G. Anonim") {
  # Adds bracket positions and the text label shown above each bracket.
  # COND is overwritten with a constant; presumably so this table carries
  # the `fill` column the plot's global aesthetics ask for - TODO confirm.
  annotate_pairs <- function(tests) {
    positioned <- add_xy_position(tests, x = "COND")
    mutate(positioned,
           COND = "NEG-CTR",
           lab = paste(p, " - ", p.adj.signif))
  }

  if (parametric) {
    pwc <- annotate_pairs(
      data %>%
        pairwise_t_test(value~COND, paired = TRUE,
                        p.adjust.method = "bonferroni")
    )
    res.test <- data %>% anova_test(value~COND)
  } else {
    pwc <- annotate_pairs(data %>% pairwise_wilcox_test(value~COND))
    res.test <- data %>% kruskal_test(value~COND)
  }

  canvas <- ggplot(data, aes(COND, value, fill = COND))
  canvas +
    geom_violin(alpha = 0.2) +
    geom_boxplot(outlier.shape = 23, outlier.size = 3, alpha = 0.6) +
    geom_jitter(shape = 21, width = 0.1) +
    stat_pvalue_manual(pwc, step.increase = 0.05, label = "lab") +
    ylab(signal) +
    labs(
      title = get_test_label(res.test, detailed = TRUE),
      subtitle = get_pwc_label(pwc),
      caption = autor
    )
}


#special boxplot for the P3FCz signal
df$data[[1]] %>% SpecBoxplot3("P3FCz", TRUE)
df$data[[1]] %>% SpecBoxplot3("P3FCz", FALSE)

As you can see on the chart, there are no outliers anymore!

Now we're ready to plot each signal!

# Build the parametric and the non-parametric SpecBoxplot3() for one signal
# and attach both plots to `df` as list columns.
#
# df         data frame whose `data` list column holds, as its first element,
#            the table to plot
# signal     passed on to SpecBoxplot3() as the y-axis label
# printPlot  when TRUE, both plots are printed as a side effect
#
# Returns `df` with the added list columns `boxplot1` (parametric) and
# `boxplot2` (non-parametric).
AddSignalBoxplot3 <- function(df, signal, printPlot=TRUE) {
  signal_data <- df$data[[1]]
  parametric_plot <- SpecBoxplot3(signal_data, signal, TRUE)
  rank_plot <- SpecBoxplot3(signal_data, signal, FALSE)

  if (printPlot) {
    print(parametric_plot)
    print(rank_plot)
  }

  mutate(df,
         boxplot1 = list(parametric_plot),
         boxplot2 = list(rank_plot))
}

#Added special boxplot3
df %>% group_by(signals) %>%
  group_modify(~AddSignalBoxplot3(.x, .y))

Good luck on your further analysis !!

Last update

# Histogram + kernel density of `value`, one facet per COND level, with a
# normal curve (group mean and sd) drawn on top of each facet.
#
# df     data frame whose `data` list column holds, as its first element,
#        a table with the columns `COND` and `value`
# group  passed to xlab() as the x-axis label
#
# Returns the ggplot object.
create.plot2 <- function(df, group){
  signal_data <- df$data[[1]]
  lowest <- min(signal_data$value)
  highest <- max(signal_data$value)

  # Per COND level: n * 100 grid points spanning the overall value range and
  # the normal density at each point, using that level's mean and sd.
  by_condition <- group_by(signal_data, COND)
  normal_curves <- summarise(
    by_condition,
    n = n(),
    mean = mean(value),
    sd = sd(value),
    min = lowest,
    max = highest,
    x = seq(min, max, length.out = n*100),
    value = dnorm(x, mean, sd)
  )

  ggplot(signal_data, aes(value, fill = COND)) +
    geom_histogram(aes(y = ..density..), colour = "black", fill = "white",
                   bins = 30) +
    geom_density(alpha = .2, fill = "red", col = "red") +
    geom_line(aes(x, value), data = normal_curves, col = "blue") +
    xlab(group) +
    facet_grid(cols = vars(COND))
}

df %>% group_by(signals) %>% 
  group_map(create.plot2)

这篇关于如何通过迭代方法或管道运算符从嵌套数据集中删除异常值行的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

如何通过迭代方法或管道运算符从嵌套数据集中删除异常值行 [英] How to delete rows of outliers rom a nested dataset via an iterative method or pipe operator

问题描述

相关文章

其他开发最新文章

热门教程

热门工具

登录关闭

如何通过迭代方法或管道运算符从嵌套数据集中删除异常值行 [英] How to delete rows of outliers rom a nested dataset via an iterative method or pipe operator

问题描述

相关文章

其他开发最新文章

热门教程

热门工具

登录 关闭

登录关闭