如何使用Python解码Angular的自定义HTML编码 [英] How to decode Angular's custom HTML encoding with Python
问题描述
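我想抓取并解析伦敦证券交易所(London Stock Exchange)的一篇新闻文章。该站点几乎全部内容都来自一段供 JavaScript 使用的 JSON,不过可以用 BeautifulSoup 轻松提取,并用 JSON 模块解析。但是该脚本的编码有点奇怪:<script> 标签的 id 为 "ng-lseg-state",这意味着这是 Angular 的自定义 HTML 编码。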
例如:
&l;div class=\"news-body-content\"&g;&l;html xmlns=\"http://www.w3.org/1999/xhtml\"&g;\n&l;head&g;\n&l;meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" /&g;\n&l;title&g;&l;/title&g;\n&l;meta name=\"generator\"
我使用 .replace()
链来处理此问题:
import json

import requests
from bs4 import BeautifulSoup

# The asker's first attempt: decode a handful of Angular transfer-state tokens
# with a chain of str.replace() calls.
url = "https://www.londonstockexchange.com/news-article/ESNT/date-for-fy-2020-results-announcement/14850033"
script = BeautifulSoup(requests.get(url).text, "lxml").find("script", {"id": "ng-lseg-state"})
# Only the quote token is decoded before parsing, so the payload becomes valid JSON.
article = json.loads(script.string.replace("&q;", '"'))
# "&a;" has not been decoded at this point, so the lookup key still contains it.
main_key = "G.{{api_endpoint}}/api/v1/pages?parameters=newsId%3D14850033&a;path=news-article"
article_body = article[main_key]["body"]["components"][1]["content"]["newsArticle"]["value"]
decoded_body = (
    article_body
    .replace('&l;', '<')
    .replace('&g;', '>')
    .replace('&q;', '"')
)
print(BeautifulSoup(decoded_body, "lxml").find_all("p"))
但是还有一些我不确定如何处理的字符:
-
&a;#160;
-
&a;amp;
-
&s;
仅举几例.
所以,问题是:我该如何处理其余的字符?或者,是否有我不知道的现成解析器或可靠的字符映射表?
角度编码传输状态使用位于 <代码>导出函数escapeHtml(text:string):string {const escapedText:{[k:字符串]:字符串} = {'&':'& a;','" ;:'& q;','\'':'& s;','<':'& l;','>':'& g;',};return text.replace(/[&''<>]/g,s => scapedText [s]);}导出函数unescapeHtml(text:string):string {const unescapedText:{[k:字符串]:字符串} = {'& a':'&','& q:':'"','& s':'\'','& l':'<','& g:':'>',};返回text.replace(/& [^;] +;/g,s => unescapedText [s]);} 您可以在python中重现 您缺少 repl.it: https://replit.com/@bertrandmartel/AngularTransferStateDecode > I want to scrape and parse a London Stock Exchange news article. Almost the entire content of the site comes from a But the encoding of the script is a bit funky. The For example: I handle this with a But there are still some characters that I'm not sure how to handle: just to name a few. So, the question is, how do I deal with the rest of the chars? Or maybe there's a parser or a reliable char mapping out there that I don't know of? Angular encodes transfer state using a special escape function located here: You can reproduce the you were missing repl.it: https://replit.com/@bertrandmartel/AngularTransferStateDecode 这篇关于如何使用Python解码Angular的自定义HTML编码的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋! unescapeHtml
函数,并添加 html.unescape
来解析其他html实体:
import html
import json
import re

import requests
from bs4 import BeautifulSoup

# Angular transfer-state tokens and the characters they stand for (the same
# table as Angular's unescapeHtml).
unescapedText = {
    '&a;': '&',
    '&q;': '"',
    '&s;': '\'',
    '&l;': '<',
    '&g;': '>',
}

# One alternation over every token so the text is decoded in a single pass.
_ANGULAR_TOKEN_RE = re.compile("|".join(map(re.escape, unescapedText)))


def unescape(str):
    """Decode Angular transfer-state tokens, then standard HTML entities.

    The tokens are substituted in one pass so that the "&" produced by "&a;"
    is never re-read as the start of another token (with chained replace()
    calls, "&a;l;" would wrongly end up as "<" instead of "&l;").

    Args:
        str: The encoded text. The parameter name shadows the builtin; it is
            kept so existing keyword callers continue to work.

    Returns:
        The decoded text, after html.unescape() has resolved remaining
        entities such as "&#160;" or "&amp;".
    """
    decoded = _ANGULAR_TOKEN_RE.sub(lambda m: unescapedText[m.group(0)], str)
    # NOTE(review): html.unescape() also turns "&quot;" into a bare quote; if
    # the payload ever contains it inside a JSON string, json.loads() will
    # fail. Confirm against real payloads.
    return html.unescape(decoded)


url = "https://www.londonstockexchange.com/news-article/ESNT/date-for-fy-2020-results-announcement/14850033"
# Without a timeout requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
response.raise_for_status()
script = BeautifulSoup(response.text, "lxml").find("script", {
    "id": "ng-lseg-state"
})
if script is None or script.string is None:
    raise RuntimeError('no <script id="ng-lseg-state"> payload found in the page')
payload = json.loads(unescape(script.string))
# The key uses a plain "&" because the whole payload was decoded before parsing.
main_key = "G.{{api_endpoint}}/api/v1/pages?parameters=newsId%3D14850033&path=news-article"
article_body = payload[main_key]["body"]["components"][1]["content"]["newsArticle"]["value"]
print(BeautifulSoup(article_body, "lxml").find_all("p"))
&s;
和 &a;
JSON
that's consumed by JavaScript
. However, this can be easily extracted with BeautifulSoup
and parsed with the JSON
module.
<script>
tag has an id
of "ng-lseg-state"
, which means this is Angular's custom HTML encoding.
&l;div class=\"news-body-content\"&g;&l;html xmlns=\"http://www.w3.org/1999/xhtml\"&g;\n&l;head&g;\n&l;meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" /&g;\n&l;title&g;&l;/title&g;\n&l;meta name=\"generator\"
.replace()
chain:
import json
import requests
from bs4 import BeautifulSoup
# The asker's first attempt: fetch the page, pull the Angular transfer-state
# JSON out of the <script> tag and decode three tokens by hand.
url = "https://www.londonstockexchange.com/news-article/ESNT/date-for-fy-2020-results-announcement/14850033"
page = requests.get(url)
script = BeautifulSoup(page.text, "lxml").find("script", attrs={"id": "ng-lseg-state"})
# Decode only the quote token up front so the transfer state parses as JSON.
raw_state = script.string.replace("&q;", '"')
article = json.loads(raw_state)
# The ampersand is still encoded as "&a;" here, so the key contains it too.
main_key = "G.{{api_endpoint}}/api/v1/pages?parameters=newsId%3D14850033&a;path=news-article"
news_article = article[main_key]["body"]["components"][1]["content"]["newsArticle"]
article_body = news_article["value"]
# Apply the remaining replacements in the same order as the original chain.
decoded_body = article_body
for token, plain in (('&l;', '<'), ('&g;', '>'), ('&q;', '"')):
    decoded_body = decoded_body.replace(token, plain)
paragraphs = BeautifulSoup(decoded_body, "lxml").find_all("p")
print(paragraphs)
&a;#160;
&a;amp;
&s;
/**
 * Escape the characters `&`, `"`, `'`, `<` and `>` in `text` using the short
 * tokens `&a;`, `&q;`, `&s;`, `&l;` and `&g;`.
 *
 * @param text The string to escape.
 * @returns `text` with each of those five characters replaced by its token.
 */
export function escapeHtml(text: string): string {
// Character -> replacement token.
const escapedText: {[k: string]: string} = {
'&': '&a;',
'"': '&q;',
'\'': '&s;',
'<': '&l;',
'>': '&g;',
};
// Every character matched by the class is a key of the table above.
return text.replace(/[&"'<>]/g, s => escapedText[s]);
}
/**
 * Reverse `escapeHtml`: replace the tokens `&a;`, `&q;`, `&s;`, `&l;` and
 * `&g;` in `text` with the characters they stand for.
 *
 * @param text The escaped string.
 * @returns `text` with every `&...;` match replaced by its table entry.
 */
export function unescapeHtml(text: string): string {
// Token -> original character.
const unescapedText: {[k: string]: string} = {
'&a;': '&',
'&q;': '"',
'&s;': '\'',
'&l;': '<',
'&g;': '>',
};
// The text is scanned once, so the '&' produced from '&a;' is not re-examined.
// NOTE(review): a `&...;` match that is not in the table yields `undefined`,
// which `replace` inserts as the text "undefined" — presumably the input is
// always the output of escapeHtml; confirm before using on other text.
return text.replace(/&[^;]+;/g, s => unescapedText[s]);
}
unescapeHtml
function in python, and add html.unescape
to resolve additional HTML entities:
import json
import requests
from bs4 import BeautifulSoup
import html
import re

# Angular transfer-state tokens and the characters they stand for (the same
# table as Angular's unescapeHtml).
unescapedText = {
    '&a;': '&',
    '&q;': '"',
    '&s;': '\'',
    '&l;': '<',
    '&g;': '>',
}

# One alternation over every token so the text is decoded in a single pass.
_ANGULAR_TOKEN_RE = re.compile("|".join(map(re.escape, unescapedText)))


def unescape(str):
    """Decode Angular transfer-state tokens, then standard HTML entities.

    The tokens are substituted in one pass so that the "&" produced by "&a;"
    is never re-read as the start of another token (with chained replace()
    calls, "&a;l;" would wrongly end up as "<" instead of "&l;").

    Args:
        str: The encoded text. The parameter name shadows the builtin; it is
            kept so existing keyword callers continue to work.

    Returns:
        The decoded text, after html.unescape() has resolved remaining
        entities such as "&#160;" or "&amp;".
    """
    decoded = _ANGULAR_TOKEN_RE.sub(lambda m: unescapedText[m.group(0)], str)
    # NOTE(review): html.unescape() also turns "&quot;" into a bare quote; if
    # the result is fed to json.loads() and the payload contains it inside a
    # JSON string, parsing will fail. Confirm against real payloads.
    return html.unescape(decoded)
# Fetch the article page, decode the Angular transfer-state JSON embedded in
# it and print the <p> elements of the news article body.
url = "https://www.londonstockexchange.com/news-article/ESNT/date-for-fy-2020-results-announcement/14850033"
# Without a timeout requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail on HTTP errors here instead of trying to parse an error page.
response.raise_for_status()
script = BeautifulSoup(response.text, "lxml").find("script", {
    "id": "ng-lseg-state"
})
if script is None or script.string is None:
    raise RuntimeError('no <script id="ng-lseg-state"> payload found in the page')
payload = json.loads(unescape(script.string))
# The key uses a plain "&" because the whole payload was decoded before parsing.
main_key = "G.{{api_endpoint}}/api/v1/pages?parameters=newsId%3D14850033&path=news-article"
article_body = payload[main_key]["body"]["components"][1]["content"]["newsArticle"]["value"]
print(BeautifulSoup(article_body, "lxml").find_all("p"))
&s;
and &a;