VBA没有完全获取Web表 [英] Web table is not being completely fetched by VBA

查看:130
本文介绍了VBA没有完全获取Web表的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我需要从






示例代码输出:








VBA代码:

 选项明确

'工具>参考文献> HTML对象库
Public Sub GetTable()

Dim sResponse As String,listItems As Object,html As HTMLDocument,headers()
headers = Array(product_id,product_name ,product_price,product_category,currency,spr,shop_name,delivery_time,shop_rating,position,free_return,approved_shipping)

Application.ScreenUpdating = False

使用CreateObject(MSXML2.XMLHTTP)
。打开GET,https://www.idealo.de/preisvergleich/OffersOfProduct/143513.html ,False
.send
sResponse = StrConv(。responseseBody,vbUnicode)
End with

sResponse = Mid $(sResponse,InStr(1,sResponse, <!DOCTYPE))
设置html =新HTMLDocument
使用html
.body.innerHTML = sResponse
设置listItems = .getElementsByClassName(productOffers-listItemOfferPrice)
结束时

Dim currentItem As Long
使用ActiveSheet
.Cells(1,1).Resize(1,UBound(headers)+ 1)= headers
For currentItem = 0 to listItems.Length - 1
Dim tempString As String,columnValues()As String
tempString = TidyString(GetTransactionInfo(listItems(currentItem).outerHTML),&#\\\ +;)
columnValues = GetColumnValues(tempString,headers)
.Cells(currentItem + 2, 1).Resize(1,UBound(columnValues)+ 1)= columnValues
Next currentItem
End with
Application.ScreenUpdating = True
End Sub

公共函数GetTransactionInfo(ByVal inputString)As String
'拆分以获取事务项,即标题和关联值
GetTransactionInfo =拆分(拆分(inputString,transaction,)(1 ),})(0)
结束函数

公共函数TidyString(ByVal inputString As String,ByVal matchPattern As String)As String
'提取交易信息
'使用正则表达式找到这些不需要的字符串并替换模式&安培;#\d +;
'示例inputString

Dim regex As Object,tempString As String
设置regex = CreateObject(VBScript.RegExp)

使用regex
.Global = True
.MultiLine = True
.IgnoreCase = False
.Pattern = matchPattern


结束如果是regex.TEST(inputString) )然后
TidyString = regex.Replace(inputString,vbNullString)
否则
TidyString = inputString
结束如果
结束函数

公共函数GetColumnValues(ByVal inputString As String,ByVal headers As Variant)As Variant
'示例输入字符串product_id:143513,product_name:Canon 500D Nahlinse 72mm,product_price:128.0, product_category:26570,currency:EUR,spr:cfd,shop_name:computeruniverse.net,delivery_time:long,shop_rating:100, position:1,free_return:14,approved_shipping:false
'仅提取内部字符串val每个标题的例子,例如143513
Dim arr()As String,currentItem As Long,tempString As String
tempString = inputString
for currentItem = LBound(headers)To UBound(headers)
tempString = TidyString( tempString,Chr $(34)& headers(currentItem)& Chr $(34)&:)
Next currentItem
arr = Split(替换$(tempString,Chr $(34) ,vbNullString),,)
GetColumnValues = arr
结束函数


I need to fetch the price table from this site.

For this I have already developed some code:

Sub TableExample()
    Dim IE As Object
    Dim doc As Object
    Dim strURL As String

    strURL = "http://www.idealo.de/preisvergleich/OffersOfProduct/143513.html"

    ' replace with URL of your choice

    Set IE = CreateObject("InternetExplorer.Application")
    With IE
         '.Visible = True

        .navigate strURL
        Do Until .readyState = 4: DoEvents: Loop
            Do While .Busy: DoEvents: Loop
                Set doc = IE.document
                GetAllTables doc

                .Quit
            End With
        End Sub

       Sub GetAllTables(doc As Object)

     ' get all the tables from a webpage document, doc, and put them in a new worksheet

    Dim ws As Worksheet
    Dim rng As Range
    Dim tbl As Object
    Dim rw As Object
    Dim cl As Object
    Dim tabno As Long
    Dim nextrow As Long
    Dim I As Long

    Set ws = Sheets("Sheet1")


    For Each tbl In doc.getElementsByTagName("TABLE")
        tabno = tabno + 1
        nextrow = nextrow + 1
        Set rng = ws.Range("B" & nextrow)
        'rng.Offset(, -1) = "Table " & tabno
        If tabno = 5 Then

        For Each rw In tbl.Rows
            colno = 6
            For Each cl In rw.Cells
                If colno = 5 And nextrow < 1 Then
                    Set classColl = doc.getElementsByClassName("shop")
                    Set imgTgt = classColl(nextrow - 2).getElementsByTagName("img").getElementsByClassName("btn-goto-shop")
                    rng.Value = imgTgt(0).getAttribute("alt")

                Else
                    rng.Value = cl.innerText
                End If
                Set rng = rng.Offset(, 1)
                I = I + 1
                colno = colno + 1
            Next cl
            nextrow = nextrow + 1
            Set rng = rng.Offset(1, -I)
            I = 0
        Next rw
        End If
    Next tbl

    ws.Cells.ClearFormats

End Sub

Through this code I can get the desired result, except that the last column with the shop name given is not being fetched. Can anyone help me with this?

解决方案

If you inspect the HTML for the page you can see that the elements with className productOffers-listItemOfferPrice have the required info. There is more information than perhaps you might realise. See my code output at bottom.



In the main sub GetTable, I use an XHR request to get the page HTML and store it in an HTML document.

When you do .getElementsByClassName("productOffers-listItemOfferPrice") to get all the item info, you need to parse each elements .outerHTML.


The helper function GetTransactionInfo uses split function to get just the product info part of the .outerHTML. The returned string looks like the following example sample:

"&#10;&#9;&#9;&#9;"product_id": &#9;&#9;&#9;"143513",&#10;&#9;&#9;&#9;"product_name": ..."


The helper function TidyString takes an inputString and regex pattern, applies regex pattern matching to tidy the product information string, by matching unwanted strings and replacing them with empty literal strings (vbNullString).


Regex pattern 1:

For example, the first regex pattern "&#\d+;" gets rid of all the &# with numbers in the string:

Try it


Regex pattern 2:

The second regex pattern, Chr$(34) & headers(currentItem) & Chr$(34) & ":", removes the product header information from the string i.e. to get just the values.

E.g. it takes "product_id": "143513" and returns "143513".

Try it


Example page info (sample)


Example code output:


VBA Code:

Option Explicit

'Tools > References > HTML Object Library
Public Sub GetTable()

    Dim sResponse As String, listItems As Object, html As HTMLDocument, headers()
    headers = Array("product_id", "product_name", "product_price", "product_category", "currency", "spr", "shop_name", "delivery_time", "shop_rating", "position", "free_return", "approved_shipping")

    Application.ScreenUpdating = False

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "https://www.idealo.de/preisvergleich/OffersOfProduct/143513.html", False
        .send
        sResponse = StrConv(.responseBody, vbUnicode)
    End With

    sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
    Set html = New HTMLDocument
    With html
        .body.innerHTML = sResponse
        Set listItems = .getElementsByClassName("productOffers-listItemOfferPrice")
    End With

    Dim currentItem As Long
    With ActiveSheet
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        For currentItem = 0 To listItems.Length - 1
            Dim tempString As String, columnValues() As String
            tempString = TidyString(GetTransactionInfo(listItems(currentItem).outerHTML), "&#\d+;")
            columnValues = GetColumnValues(tempString, headers)
            .Cells(currentItem + 2, 1).Resize(1, UBound(columnValues) + 1) = columnValues
        Next currentItem
    End With
    Application.ScreenUpdating = True
End Sub

Public Function GetTransactionInfo(ByVal inputString) As String
    'Split to get just the transaction items i.e. Headers and associated values
    GetTransactionInfo = Split(Split(inputString, """transaction"",")(1), "}")(0)
End Function

Public Function TidyString(ByVal inputString As String, ByVal matchPattern As String) As String
    'Extract transaction info
    'Use regex to find these unwanted strings and replace pattern e.g. &#\d+;
    'Example inputString

    Dim regex As Object, tempString As String
    Set regex = CreateObject("VBScript.RegExp")

    With regex
        .Global = True
        .MultiLine = True
        .IgnoreCase = False
        .Pattern = matchPattern
    End With

    If regex.TEST(inputString) Then
        TidyString = regex.Replace(inputString, vbNullString)
    Else
        TidyString = inputString
    End If
End Function

Public Function GetColumnValues(ByVal inputString As String, ByVal headers As Variant) As Variant
    ' Example input string "product_id": "143513","product_name": "Canon 500D Nahlinse 72mm","product_price": "128.0","product_category": "26570","currency": "EUR","spr": "cfd","shop_name": "computeruniverse.net","delivery_time": "long","shop_rating": "100","position": "1","free_return": "14","approved_shipping": "false"
    ' Extract just the inner string value of each header e.g. 143513
    Dim arr() As String, currentItem As Long, tempString As String
    tempString = inputString
    For currentItem = LBound(headers) To UBound(headers)
        tempString = TidyString(tempString, Chr$(34) & headers(currentItem) & Chr$(34) & ":")
    Next currentItem
    arr = Split(Replace$(tempString, Chr$(34), vbNullString), ",")
    GetColumnValues = arr
End Function

这篇关于VBA没有完全获取Web表的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆