从网站上的表格中抓取数据,而无需搜索标签 [英] scrape data from a table on a website without having to search for tags
问题描述
这是该问题的延续
这就是我的意思,但是现在它在第一页之后停止,加粗的行突出显示了
公共Sub VisitPages()
Dim IE作为新的InternetExplorer
与IE
.Visible = True
.navigate http://healthapps.state.nj.us/facilities/acSetSearch.aspx?by=county
而.Busy或.ReadyState< 4:DoEvents:通过.Document
.querySelector(#middleContent_cbType_1)。Went
。单击
.querySelector(#middleContent_cbType_4)。单击
.querySelector ( #middleContent_btnGetList)。单击
以
结尾的
而.Busy或.ReadyState< 4:DoEvents:Wend
昏暗列表作为对象,我只要
设置列表= .Document.querySelectorAll(#main_table [href * = doPostBack])
For i = 0要列出。长度-1
list.Item(i)。单击.bsy或.ReadyState时单击
。 4:DoEvents:Wend
’应用程序。立即等待+ TimeSerial(0,0,3)’===稍后删除我。这只是为了演示页面更改
** Debug.Print .Document.getElementById( middleContent_lbName_county)。outerHTML **
'用新页面执行操作
Dim FirstOcc As Long
Dim TtlHosp作为变体
Dim FLine作为变体
Dim FLine固定为Variant
TtlHosp = Mid(.Document.Body.innerHTML,InStr(.Document.Body .innerHTML,病床总数,4000)
直到InStr(TtlHosp,< td align =& Chr(34)& left& Chr(34) &>)= 0
FirstOcc = InStr(TtlHosp,< td align =& Chr(34)& left& Chr(34)& >)
FLine = Mid(TtlHosp,FirstOcc + 17,150)
FLineFixed = Mid(FLine,1,InStr(FLine,< / td>)-1)
Debug.Print FLineFixed
TtlHosp = Mid(TtlHosp,FirstOcc + 17,2000)
循环
.Navigate2 .Document.URL’< ==返回首页
而.Busy或.ReadyState< 4:DoEvents:Wend
设置列表= .Document.querySelectorAll(#main_table [href * = doPostBack])'重置列表(在这些情况下通常是必需的)
下一页
Stop'< ; ==稍后删除我
'。退出'< ==记住退出应用程序
以
结尾结束
看来,该信息仅用于特殊医院的选择,而床位信息位于第三张表中。 (注意:结果当前是用Excel编写的)
Option Explicit
Public Sub VisitPages()
Dim IE作为新的InternetExplorer,ws作为工作表
设置ws = ThisWorkbook.Worksheets( Sheet1)
与IE
.Visible = True
.navigate http:// healthapps。 state.nj.us/facilities/acSetSearch.aspx?by=county
而.Busy或.readyState< 4:DoEvents:通过.document
.querySelector(#middleContent_cbType_5)。Went
。单击
.querySelector(#middleContent_btnGetList)。单击
结尾为
而.Busy或.readyState< 4:DoEvents:Wend
昏暗列表作为对象,我只要
设置列表= .document.querySelectorAll(#main_table [href * = doPostBack])
对于我= 0要列出。长度-1
list.item(i)。单击
而.Busy或.readyState< 4:DoEvents:Wend
WriteTable .document.getElementsByTagName( table)(3),.document.getElementById( middleContent_lbName_county)。innerText,GetLastRow(ws,3)+ 1,ws
'用新页面
做事。Navigate2 .document.URL'&=; ==返回首页
而.Busy或.readyState< 4:DoEvents:Wend
设置列表= .document.querySelectorAll(#main_table [href * = doPostBack])'重置列表(在这些情况下通常是必需的)
下一个
.Quit' < ==记住退出应用程序
以
结尾的子程序
公共函数GetLastRow(ByVal ws作为工作表,可选的ByVal columnNumber只要Long = 1)只要
with ws
GetLastRow = .Cells(.Rows.Count,columnNumber).End(xlUp).Row
End with
End Function
Public Sub WriteTable( ByVal hTable作为HTMLTable,设施作为字符串,可选ByVal startRow作为Long = 1,可选ByVal ws作为工作表)
如果ws没有,则设置ws = ActiveSheet
Dim tRow作为对象,tCell作为对象,作为对象,作为对象,作为对象,作为对象,作为对象,作为标题,作为标题行
r = startRow:titleRow = startRow
ws
设置tRow = hTable.getElementsByTagName( tr)
.Cells(titleRow, 1)=设施
对于每个tr in tRow
r = r + 1
Set tCell = tr.getElementsByTagName( td)
c = 2
对于每个td In tCell
.Cells(r,c).Value = td.innerText
c = c + 1
下一个td
下一个tr
以
结尾的结尾
this is a continuation from this question using InStr to search for quotes, spaces, colons, etc
i'm also trying to get all this data below. i was going to do a loop that will search for <td align="left">
but i have a feeling it will bring in a ton of garbage along with the results i need. i'm wondering if there's a better way of doing this.
<b>Total Hospital Beds</b></td>
<td align="left">Adult ICU (intensive care unit) CCU (critical care unit)</td>
<td align="left">26</td>
<td align="left">Medical/surgical</td>
<td align="left">198</td>
<td align="left">Pediatric</td>
<td align="left">20</td>
<td align="center" colspan="2"><b>Services</b></td>
<td align="left">Acute Hemodialoysis Service</td>
<td align="left">Chronic Hemodialysis Stations</td>
<td align="left">Magnetic Resonance Imaging - On Site</td>
<td align="left">Mixed OR's</td>
<td align="left">7</td>
basically, everything at the section on the bottom, screenshot attached
This is what I have but now it stops after the first page, the bolded line is highlighted
Public Sub VisitPages()
Dim IE As New InternetExplorer
With IE
.Visible = True
.navigate "http://healthapps.state.nj.us/facilities/acSetSearch.aspx?by=county"
While .Busy Or .ReadyState < 4: DoEvents: Wend
With .Document
.querySelector("#middleContent_cbType_1").Click
.querySelector("#middleContent_cbType_4").Click
.querySelector("#middleContent_btnGetList").Click
End With
While .Busy Or .ReadyState < 4: DoEvents: Wend
Dim list As Object, i As Long
Set list = .Document.querySelectorAll("#main_table [href*=doPostBack]")
For i = 0 To list.Length - 1
list.Item(i).Click
While .Busy Or .ReadyState < 4: DoEvents: Wend
' Application.Wait Now + TimeSerial(0, 0, 3) '<== Delete me later. This is just to demo page changes
**Debug.Print .Document.getElementById("middleContent_lbName_county").outerHTML**
'do stuff with new page
Dim FirstOcc As Long
Dim TtlHosp As Variant
Dim FLine As Variant
Dim FLineFixed As Variant
TtlHosp = Mid(.Document.Body.innerHTML, InStr(.Document.Body.innerHTML, "Total Hospital Beds"), 4000)
Do Until InStr(TtlHosp, "<td align=" & Chr(34) & "left" & Chr(34) & ">") = 0
FirstOcc = InStr(TtlHosp, "<td align=" & Chr(34) & "left" & Chr(34) & ">")
FLine = Mid(TtlHosp, FirstOcc + 17, 150)
FLineFixed = Mid(FLine, 1, InStr(FLine, "</td>") - 1)
Debug.Print FLineFixed
TtlHosp = Mid(TtlHosp, FirstOcc + 17, 2000)
Loop
.Navigate2 .Document.URL '<== back to homepage
While .Busy Or .ReadyState < 4: DoEvents: Wend
Set list = .Document.querySelectorAll("#main_table [href*=doPostBack]") 'reset list (often required in these scenarios)
Next
Stop '<== Delete me later
'.Quit '<== Remember to quit application
End With
End Sub
It appears that info is only for special hospital selection and that the bed info is in the third table. (Note: Results are currently written in Excel)
Option Explicit
Public Sub VisitPages()
Dim IE As New InternetExplorer, ws As Worksheet
Set ws = ThisWorkbook.Worksheets("Sheet1")
With IE
.Visible = True
.navigate "http://healthapps.state.nj.us/facilities/acSetSearch.aspx?by=county"
While .Busy Or .readyState < 4: DoEvents: Wend
With .document
.querySelector("#middleContent_cbType_5").Click
.querySelector("#middleContent_btnGetList").Click
End With
While .Busy Or .readyState < 4: DoEvents: Wend
Dim list As Object, i As Long
Set list = .document.querySelectorAll("#main_table [href*=doPostBack]")
For i = 0 To list.Length - 1
list.item(i).Click
While .Busy Or .readyState < 4: DoEvents: Wend
WriteTable .document.getElementsByTagName("table")(3), .document.getElementById("middleContent_lbName_county").innerText, GetLastRow(ws, 3) + 1, ws
'do stuff with new page
.Navigate2 .document.URL '<== back to homepage
While .Busy Or .readyState < 4: DoEvents: Wend
Set list = .document.querySelectorAll("#main_table [href*=doPostBack]") 'reset list (often required in these scenarios)
Next
.Quit '<== Remember to quit application
End With
End Sub
Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
With ws
GetLastRow = .Cells(.Rows.Count, columnNumber).End(xlUp).Row
End With
End Function
Public Sub WriteTable(ByVal hTable As HTMLTable, facility As String, Optional ByVal startRow As Long = 1, Optional ByVal ws As Worksheet)
If ws Is Nothing Then Set ws = ActiveSheet
Dim tRow As Object, tCell As Object, tr As Object, td As Object, r As Long, c As Long, titleRow As Long
r = startRow: titleRow = startRow
With ws
Set tRow = hTable.getElementsByTagName("tr")
.Cells(titleRow, 1) = facility
For Each tr In tRow
r = r + 1
Set tCell = tr.getElementsByTagName("td")
c = 2
For Each td In tCell
.Cells(r, c).Value = td.innerText
c = c + 1
Next td
Next tr
End With
End Sub
这篇关于从网站上的表格中抓取数据,而无需搜索标签的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!