使用循环从嵌套列表中提取数据 [英] Extract data from a nested list with loops
问题描述
我从登录xml格式的网站进行了网络抓取,并将其变成了列表.现在我很难从嵌套列表中提取数据,因为它非常复杂.
I scraped a website that requires a login and returns XML, and I have already converted the result into a list. Now I am having difficulty extracting data from the nested list because its structure is very complicated.
这是我的z2结构的一部分:
Here is a part of my z2 structure:
dput(z2)
structure(list(scheduleList = structure(list(
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("011c"), people = list("2"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "011c", status = "-2"),
class = structure(list(name = list("013"), people = list("0"), teacher = structure(list(name = list("B")), .Names = "name", id = "D14")), .Names = c("name", "people", "teacher"), id = "602d", status = "-4"),
class = structure(list(name = list("603"), people = list("6"), teacher = structure(list(name = list("C")), .Names = "name", id = "D31")), .Names = c("name", "people", "teacher"), id = "603", status = "-4")),
.Names = c("class", "class", "class"), id = "1"),
score = structure(list(
class = structure(list(name = list("011c"), people = list("4"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "011", status = "-2"),
class = structure(list(name = list("015c"), people = list("51"), teacher = structure(list(name = list("D")), .Names = "name", id = "D23")), .Names = c("name", "people", "teacher"), id = "666", status = "-4")),
.Names = c("class","class"), id = "2"),
score = structure(list(
class = structure(list(name = list("017c"), people = list("1"), teacher = structure(list(name = list("E")), .Names = "name", id = "D15")), .Names = c("name", "people", "teacher"), id = "017", status = "-2"),
class = structure(list(name = list("019c"), people = list("22"), teacher = structure(list(name = list("F")), .Names = "name", id = "D28")), .Names = c("name", "people", "teacher"), id = "561", status = "-4"),
class = structure(list(name = list("562d"), people = list("28"), teacher = structure(list(name = list("G")), .Names = "name", id = "D21")), .Names = c("name", "people", "teacher"), id = "562", status = "-4")),
.Names = c("class", "class", "class"), id = "3")),
.Names = c("score", "score", "score"), date = "2017-01-25"),
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("011c"), people = list("80"), teacher = structure(list(name = list("H")), .Names = "name", id = "D47")), .Names = c("name", "people", "teacher"), id = "011", status = "-4"),
class = structure(list(name = list("013c"), people = list("37"), teacher = structure(list(name = list("I")), .Names = "name", id = "D18")), .Names = c("name", "people", "teacher"), id = "669", status = "-4"),
class = structure(list(name = list("751d"), people = list("15"), teacher = structure(list(name = list("J")), .Names = "name", id = "D61")), .Names = c("name", "people", "teacher"), id = "751", status = "-4")),
.Names = c("class", "class", "class"), id = "1"),
score = structure(list(
class = structure(list(name = list("015c"), people = list("29"), teacher = structure(list(name = list("K")), .Names = "name", id = "D13")), .Names = c("name", "people", "teacher"), id = "567", status = "-2"),
class = structure(list(name = list("666d"), people = list("14"), teacher = structure(list(name = list("L")), .Names = "name", id = "D16")), .Names = c("name", "people", "teacher"), id = "666", status = "-4")),
.Names = c("class", "class"), id = "2"),
score = structure(list(
class = structure(list(name = list("015c"), people = list("21"), teacher = structure(list(name = list("M")), .Names = "name", id = "D22")), .Names = c("name", "people", "teacher"), id = "015", status = "-4"),
class = structure(list(name = list("602d"), people = list("18"), teacher = structure(list(name = list("N")), .Names = "name", id = "D10")), .Names = c("name", "people", "teacher"), id = "602", status = "-4")),
.Names = c("class", "class"), id = "3")),
.Names = c("score", "score", "score"), date = "2017-01-26"),
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("011c"), people = list("33"), teacher = structure(list(name = list("O")), .Names = "name", id = "D30")), .Names = c("name", "people", "teacher"), id = "011", status = "-4"),
class = structure(list(name = list("013c"), people = list("70"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "601", status = "-2"),
class = structure(list(name = list("603d"), people = list("0"), teacher = structure(list(name = list("P")), .Names = "name", id = "D27")), .Names = c("name", "people", "teacher"), id = "603", status = "-4")),
.Names = c("class", "class", "class"), id = "1"),
score = structure(list(
class = structure(list(name = list("011c"), people = list("56"), teacher = structure(list(name = list("H")), .Names = "name", id = "D47")), .Names = c("name", "people", "teacher"), id = "602", status = "-4"),
class = structure(list(name = list("666d"), people = list("8"), teacher = structure(list(name = list("Q")), .Names = "name", id = "D20")), .Names = c("name", "people", "teacher"), id = "666", status = "-4")),
.Names = c("class", "class"), id = "2"),
score = structure(list(
class = structure(list(name = list("017c"), people = list("5"), teacher = structure(list(name = list("R")), .Names = "name", id = "D30")), .Names = c("name", "people", "teacher"), id = "017", status = "-4"),
class = structure(list(name = list("021c"), people = list("6"), teacher = structure(list(name = list("S")), .Names = "name", id = "D19")), .Names = c("name", "people", "teacher"), id = "561", status = "-4")),
.Names = c("class", "class"), id = "3")),
.Names = c("score", "score", "score"), date = "2017-01-27")),
.Names = c("schedule", "schedule", "schedule"), from = "2017-01-25", to = "2017-01-27")),
.Names = "scheduleList")
这是z2的一部分:
$scheduleList$schedule$score$class
$scheduleList$schedule$score$class$name
$scheduleList$schedule$score$class$name[[1]]
[1] "017C"
$scheduleList$schedule$score$class$people
$scheduleList$schedule$score$class$people[[1]]
[1] "5"
$scheduleList$schedule$score$class$teacher
$scheduleList$schedule$score$class$teacher$name
$scheduleList$schedule$score$class$teacher$name[[1]]
[1] "R"
attr(,"id")
[1] "D30"
attr(,"id")
[1] "017"
attr(,"status")
[1] "-4"
$scheduleList$schedule$score$class
$scheduleList$schedule$score$class$name
$scheduleList$schedule$score$class$name[[1]]
[1] "021C"
$scheduleList$schedule$score$class$people
$scheduleList$schedule$score$class$people[[1]]
[1] "6"
$scheduleList$schedule$score$class$teacher
$scheduleList$schedule$score$class$teacher$name
$scheduleList$schedule$score$class$teacher$name[[1]]
[1] "S"
attr(,"id")
[1] "D19"
attr(,"id")
[1] "561"
attr(,"status")
[1] "-4"
attr(,"id")
[1] "3"
attr(,"date")
[1] "2017-01-27"
attr(,"from")
[1] "2017-01-25"
attr(,"to")
[1] "2017-01-27"
由于我是新手,因此我需要从嵌套列表中提取所需的信息,所以我使用了效率最低的方法:
I need to extract certain pieces of information from the nested list. Since I am new to this, I used the most inefficient way to do so:
# Walk the nested list three levels deep (indices i, j, k) and, for each
# innermost element, gather eight fields read from list positions and from
# the "date", "id" and "status" attributes.
# NOTE(review): nothing is carried over between iterations -- the gathered
# fields are never stored in an accumulating object.
for (i in 1:length(z2[[1]])){ #length(z2[[1]])=7
for (j in 1:length(z2[[1]][[i]])){ #length(z2[[1]][[i]])=3
# NOTE(review): this loop bound reads `z`, while every other access in the
# loop reads `z2` -- presumably a typo; confirm.
for (k in 1:length(z[[1]][[i]][[j]])){
# NOTE(review): `cbind=(` assigns to a variable named `cbind` instead of
# calling cbind(); comma-separated named values inside plain parentheses are
# not valid R syntax.
cbind=(
Date=attr(z2[[1]][[i]],"date"), #date
Score=attr(z2[[1]][[i]][[j]],"id"), #score
People=z2[[1]][[i]][[j]][[k]][[2]][[1]], #people
TName=z2[[1]][[i]][[j]][[k]][[3]][[1]][[1]], #teacher name
TID=attr(z2[[1]][[i]][[j]][[k]][[3]],"id"), #teacher ID
CName=z2[[1]][[i]][[j]][[k]][[1]][[1]], #class name
CID=attr(z2[[1]][[i]][[j]][[k]],"id"), #class ID
CSta=attr(z2[[1]][[i]][[j]][[k]],"status") #class status
)
}
}
}
它在我的循环中不起作用.我想将其输出为数据帧或数组.预期的结果:
My loops don't work. I want the output as a data frame or an array. The result I expect looks like this:
Date Score TID TName CName CID CSta People
2017-01-25 1 D14 B 013c 602 -4 0
2017-01-26 2 D16 L 666d 666 -4 14
XML格式的网站示例:
XML format website example:
<result status="success">
<code>1</code>
<note>success</note>
<scheduleList from="2017-01-25" to="2017-01-26">
<schedule date="2017-01-25">
<score id="1">
<class id="011" status="-4">
<name>011c</name>
<people>116</people>
<teacher id="D47">
<name>A</name>
</teacher>
</class>
<class id="669" status="-4">
<name>669d</name>
<people>10</people>
<teacher id="D29">
<name>B</name>
</teacher>
</class>
</score>
<score id="2">
<class id="013" status="-4">
<name>013c</name>
<people>9</people>
<teacher id="D9">
<name>C</name>
</teacher>
</class>
</score>
<score id="3">
<class id="016" status="-4">
<name>016c</name>
<people>36</people>
<teacher id="D18">
<name>D</name>
</teacher>
</class>
<class id="019" status="-4">
<name>019c</name>
<people>9</people>
<teacher id="D30">
<name>E</name>
</teacher>
</class>
</score>
</schedule>
<schedule date="2017-01-26">
<score id="1">
<class id="011" status="-2">
<name>011c</name>
<people>2</people>
<teacher id="D29">
<name>F</name>
</teacher>
</class>
<class id="013" status="-2">
<name>013c</name>
<people>0</people>
<teacher id="D14">
<name>G</name>
</teacher>
</class>
</score>
<score id="2">
<class id="011" status="-2">
<name>011c</name>
<people>4</people>
<teacher id="D29">
<name>F</name>
</teacher>
</class>
</score>
<score id="3">
<class id="017" status="-2">
<name>017c</name>
<people>1</people>
<teacher id="D141">
<name>H</name>
</teacher>
</class>
<class id="019" status="-4">
<name>019c</name>
<people>22</people>
<teacher id="D291">
<name>I</name>
</teacher>
</class>
<class id="020" status="-4">
<name>020c</name>
<people>8</people>
<teacher id="D143">
<name>J</name>
</teacher>
</class>
</score>
</schedule>
</scheduleList>
</result>
代码:
# Log in by filling in and submitting the page's first HTML form, then parse
# the response as XML.
url <- "xxxxxxx"
session <- html_session(url)

# The form to fill in is the first <form> found on the page.
form <- html_form(read_html(url))[[1]]

# Requested date range plus the login fields.
filled_form <- set_values(
  form,
  fromDate = "2017-01-25",
  toDate = "2017-01-26",
  userid = "xxx",
  Password = "aaa"
)

s <- submit_form(session, filled_form)
z <- read_xml(s$response)
推荐答案
您没有把 cbind 的结果赋值给任何变量。(而且用法也不对:不要写 cbind=something,这里的等号是错误的。)
这是一种快速但可能效率不高的做法。
You are not assigning the result of cbind to anything. (It is also used in the wrong way: don't write cbind=something; the equal sign there is an error.)
This is a quick and possibly inefficient way of doing it.
# Flatten the three-level nested list in z2 (schedule -> score -> class) into
# a data frame with one row per class.
#
# Reads:  z2     -- nested list; the levels carry "date", "id" and "status"
#                   attributes, and each class holds name, people and teacher
#                   at positions 1, 2 and 3.
# Writes: result -- data frame with character columns Date, Score, People,
#                   TName, TID, CName, CID, CSta (zero rows if z2 is empty).
schedules <- z2[[1]]

# Collect the one-row data frames in a list and bind them once at the end;
# calling rbind() on the growing result inside the loop copies it every time.
rows <- list()

# seq_along() yields an empty sequence for an empty level, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(schedules)) {
  schedule <- schedules[[i]]
  for (j in seq_along(schedule)) {
    score <- schedule[[j]]
    for (k in seq_along(score)) {
      cls <- score[[k]]
      teacher <- cls[[3]]
      rows[[length(rows) + 1L]] <- data.frame(
        Date = attr(schedule, "date"),   # date
        Score = attr(score, "id"),       # score
        People = cls[[2]][[1]],          # people
        TName = teacher[[1]][[1]],       # teacher name
        TID = attr(teacher, "id"),       # teacher ID
        CName = cls[[1]][[1]],           # class name
        CID = attr(cls, "id"),           # class ID
        CSta = attr(cls, "status"),      # class status
        stringsAsFactors = FALSE         # keep text as character on R < 4.0
      )
    }
  }
}

# do.call(rbind, list()) returns NULL, so fall back to an empty data frame.
result <- if (length(rows) > 0L) do.call(rbind, rows) else data.frame()
head(result)
Date Score People TName TID CName CID CSta
1 2017-01-25 1 2 A D29 011c 011c -2
2 2017-01-25 1 0 B D14 013 602d -4
3 2017-01-25 1 6 C D31 603 603 -4
4 2017-01-25 2 4 A D29 011c 011 -2
5 2017-01-25 2 51 D D23 015c 666 -4
6 2017-01-25 3 1 E D15 017c 017 -2
这篇关于使用循环从嵌套列表中提取数据的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!