从Gmail原始内容解析inlineImages [英] Parsing inlineImages from Gmail raw content
问题描述
Gmail邮件getAttachments函数未返回inlineImages - 请参阅issue 2810 https://code.google.com/p/google-apps-script-issues/issues/detail?id=2810
我需要这样做,所以我编写了下面的代码,以便从消息原始内容中解析blob格式的内联图像,并提前知道消息中的图像cid。
然而,恐怕这种解析在我发现base64图像内容的第一个和最后一个字符的方式上是相当脆弱的,不是吗?
还有更好的方法吗?
问候,Fausto
// Extract one inline image from a Gmail message's raw source, given the image's cid.
// The translated copy of this snippet had lost the quotes around its string
// literals ("--" and "image/jpeg"), which made it invalid JavaScript.
var rawc = message.getRawContent();
var b64c1 = rawc.lastIndexOf(cid) + cid.length + 3; // first character in image base64
var b64cn = rawc.substr(b64c1).indexOf("--") - 3; // last character in image base64
var imgb64 = rawc.substring(b64c1, b64c1 + b64cn + 1); // is this fragile or safe enough?
var imgblob = Utilities.newBlob(Utilities.base64Decode(imgb64), "image/jpeg", cid); // decode and blob
我有这个问题很多次,我想我有一个非常普遍的案例解决方案。获取非嵌入式图像也是一个问题。
我不确定我的解析是否比您的更不脆弱。最终,我是通过抓取以 '--' 开头的前后两行,把 multipart 中对应的部分提取出来。其他的一切只是为了确保下次需要时,不必过多修改代码就能直接使用。我收到过一些似乎并不遵循 \r\n 换行约定的电子邮件,它们会导致问题:这一点需要留意。
getInlineImages
函数将获取消息的原始内容并返回一个对象数组。每个对象都将具有img标记的src和图像的blob。如果你只是想内联图像,你可以选择忽略任何不以'cid'开头的内容。
getBlobFromMessage 函数接收消息的原始内容和 img 标记的 src(包括 'cid'),并返回与之对应的 blob。
/**
 * Finds every <img> tag in a message's raw content and pairs its src with
 * the image data it refers to.
 *
 * - http(s) sources are downloaded with UrlFetchApp.
 * - cid: sources are resolved against the message's own MIME parts via
 *   getBlobFromMessage.
 * - anything else (relative paths, data: URIs, tags without a src) gets a
 *   null blob.
 *
 * The src attribute is read with a regular expression rather than the
 * deprecated Xml service, and tags that are folded across several lines are
 * matched as well.
 *
 * @param {string} rawContent Raw message source, e.g. GmailMessage.getRawContent().
 * @return {Array<{src: ?string, blob: ?Object}>} One entry per <img> tag, in
 *     document order; an empty array when there are no <img> tags. src is
 *     null for a tag that has no src attribute.
 */
function getInlineImages(rawContent) {
  // Schemes are case-insensitive, matching the /cid:/i test in getBlobFromMessage.
  var url = /^https?:\/\//i, cid = /^cid:/i;
  // [^>]* (unlike .*?) also crosses line breaks, so folded tags are found.
  var imgtags = rawContent.match(/<img\b[^>]*>/gi);
  if (!imgtags) return [];
  // Value may be double-quoted, single-quoted or bare. The leading \s keeps
  // attributes such as data-src from being mistaken for src.
  var srcAttribute = /\ssrc\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i;
  return imgtags.map(function(imgTag) {
    var found = srcAttribute.exec(imgTag);
    var src = null;
    if (found) {
      // An XML parser would have decoded &amp; in URLs; do the same here.
      src = (found[1] || found[2] || found[3] || "").trim().replace(/&amp;/gi, "&");
    }
    var img = {src: src};
    if (src !== null && url.test(src)) {
      img.blob = UrlFetchApp.fetch(src).getBlob();
    } else if (src !== null && cid.test(src)) {
      img.blob = getBlobFromMessage(rawContent, src);
    } else {
      img.blob = null;
    }
    return img;
  });
}
/**
 * Extracts the MIME part whose Content-ID matches an inline-image reference
 * and returns it as a blob.
 *
 * The part is located by finding the Content-ID header and cutting the raw
 * content at the surrounding lines that start with "--" (the multipart
 * boundaries). Only base64-encoded parts are supported.
 *
 * @param {string} rawContent Raw message source, e.g. GmailMessage.getRawContent().
 * @param {string} src The img src, including the "cid:" prefix.
 * @return {Object} Result of Utilities.newBlob(bytes, contentType, itemId).
 * @throws {string} A formatted message when the cid prefix, the part, a
 *     required header or the body is missing, or the encoding is not base64.
 *     Strings (not Error objects) are thrown to stay compatible with
 *     existing callers.
 */
function getBlobFromMessage(rawContent,src) {
  var cidIndex = src.search(/cid:/i);
  if (cidIndex === -1) throw Utilities.formatString("Did not find cid: prefix for inline refenece: %s", src);
  var itemId = src.substr(cidIndex + 4);

  // Some messages use bare LF (or CR) line endings; normalise so one code
  // path handles all of them.
  var content = rawContent.replace(/\r\n?/g, "\n");

  // Content-IDs routinely contain regex metacharacters (".", "+", "$"), so
  // escape the id; header names are case-insensitive, hence the "i" flag.
  var escapedId = itemId.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  var contentIdIndex = itemId === "" ? -1 : content.search(new RegExp("Content-ID:.*?" + escapedId, "i"));
  if (contentIdIndex === -1) throw Utilities.formatString("Item with ID %s not found.",src);

  // Cut out the part between the enclosing boundary lines. Fall back to the
  // start/end of the content when a boundary is missing instead of letting
  // substring() silently swap negative indexes.
  var previousBoundaryIndex = content.lastIndexOf("\n--", contentIdIndex);
  var nextBoundaryIndex = content.indexOf("\n--", contentIdIndex);
  var part = content.substring(
    previousBoundaryIndex === -1 ? 0 : previousBoundaryIndex,
    nextBoundaryIndex === -1 ? content.length : nextBoundaryIndex
  );

  // Headers end at the first blank line; everything after it is the body.
  var headerEnd = part.indexOf("\n\n");
  if (headerEnd === -1) throw Utilities.formatString("No body found for item with ID %s.", src);
  var headers = part.substring(0, headerEnd);

  var encodingMatch = headers.match(/^Content-Transfer-Encoding:(.*)$/im);
  if (!encodingMatch) throw Utilities.formatString("No Content-Transfer-Encoding header for item with ID %s.", src);
  var encoding = encodingMatch[1].trim();
  // The encoding token is case-insensitive ("BASE64" is valid).
  if (encoding.toLowerCase() !== "base64") throw Utilities.formatString("Unhandled encoding type: %s",encoding);

  var contentTypeMatch = headers.match(/^Content-Type:([^;\n]*)/im);
  if (!contentTypeMatch) throw Utilities.formatString("No Content-Type header for item with ID %s.", src);
  var contentType = contentTypeMatch[1].trim();

  // Base64 bodies are wrapped over many lines; strip every line break, not
  // just the first one.
  var blobText = part.substring(headerEnd).replace(/\s+/g, "");
  return Utilities.newBlob(Utilities.base64Decode(blobText),contentType,itemId);
}
Gmail message getAttachments function is not returning inlineImages - see issue 2810 https://code.google.com/p/google-apps-script-issues/issues/detail?id=2810
I need to do that, so I wrote the code below to parse the inline image in blob format out of the message raw content, knowing the image cid within the message, in advance.
However, I am afraid this parsing is quite fragile in the way I find the first and last character in the base64 image content, isn't it?
Is there a better way of doing this?
Regards, Fausto
// Pull one inline image out of a Gmail message's raw source, given the
// image's content id (`cid`, defined by the caller) in advance.
var rawc = message.getRawContent();
// Start 3 characters after the end of the LAST occurrence of the cid.
// NOTE(review): the +3 presumably skips the closing ">" and a line break
// before the base64 body; it will be wrong if other headers follow the
// Content-ID line. Confirm against real messages.
var b64c1 = rawc.lastIndexOf(cid) + cid.length + 3; // first character in image base64
// Offset (relative to b64c1) of the first "--" after the start, minus 3.
// NOTE(review): the -3 presumably backs up over the line break preceding the
// boundary line; if "--" is not found indexOf returns -1 and this becomes -4.
var b64cn = rawc.substr(b64c1).indexOf("--") - 3; // last character in image base64
// +1 because substring()'s end index is exclusive.
var imgb64 = rawc.substring(b64c1, b64c1 + b64cn + 1); // is this fragile or safe enough?
// The content type is hard-coded to "image/jpeg" regardless of the part's
// actual Content-Type header.
var imgblob = Utilities.newBlob(Utilities.base64Decode(imgb64), "image/jpeg", cid); // decode and blob
I've had this problem a number of times, and I think I have a pretty general case solution. Getting non-embedded images has also been a problem.
I'm not sure my parsing is any less fragile than yours. In the end, I'm extracting the relevant part of the multipart message by grabbing the surrounding lines that start with '--'. Everything else is just making sure I can reuse this without modifying the code too much the next time I need it. I have had some emails which don't seem to follow the \r\n line-ending convention and cause problems: something to look out for.
The getInlineImages
function will take the raw content of the message and return an array of objects. Each object will have the src of the img tag and the blob that goes with the image. If you just want inline images, you can choose to ignore anything that doesn't start with 'cid'.
The getBlobFromMessage
function will take the raw content of the message and the src of the img tag (including 'cid') and return the associated blob.
You can see the code commented here.
/**
 * Finds every <img> tag in a message's raw content and pairs its src with
 * the image data it refers to.
 *
 * - http(s) sources are downloaded with UrlFetchApp.
 * - cid: sources are resolved against the message's own MIME parts via
 *   getBlobFromMessage.
 * - anything else (relative paths, data: URIs, tags without a src) gets a
 *   null blob.
 *
 * The src attribute is read with a regular expression rather than the
 * deprecated Xml service, and tags that are folded across several lines are
 * matched as well.
 *
 * @param {string} rawContent Raw message source, e.g. GmailMessage.getRawContent().
 * @return {Array<{src: ?string, blob: ?Object}>} One entry per <img> tag, in
 *     document order; an empty array when there are no <img> tags. src is
 *     null for a tag that has no src attribute.
 */
function getInlineImages(rawContent) {
  // Schemes are case-insensitive, matching the /cid:/i test in getBlobFromMessage.
  var url = /^https?:\/\//i, cid = /^cid:/i;
  // [^>]* (unlike .*?) also crosses line breaks, so folded tags are found.
  var imgtags = rawContent.match(/<img\b[^>]*>/gi);
  if (!imgtags) return [];
  // Value may be double-quoted, single-quoted or bare. The leading \s keeps
  // attributes such as data-src from being mistaken for src.
  var srcAttribute = /\ssrc\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i;
  return imgtags.map(function(imgTag) {
    var found = srcAttribute.exec(imgTag);
    var src = null;
    if (found) {
      // An XML parser would have decoded &amp; in URLs; do the same here.
      src = (found[1] || found[2] || found[3] || "").trim().replace(/&amp;/gi, "&");
    }
    var img = {src: src};
    if (src !== null && url.test(src)) {
      img.blob = UrlFetchApp.fetch(src).getBlob();
    } else if (src !== null && cid.test(src)) {
      img.blob = getBlobFromMessage(rawContent, src);
    } else {
      img.blob = null;
    }
    return img;
  });
}
/**
 * Extracts the MIME part whose Content-ID matches an inline-image reference
 * and returns it as a blob.
 *
 * The part is located by finding the Content-ID header and cutting the raw
 * content at the surrounding lines that start with "--" (the multipart
 * boundaries). Only base64-encoded parts are supported.
 *
 * @param {string} rawContent Raw message source, e.g. GmailMessage.getRawContent().
 * @param {string} src The img src, including the "cid:" prefix.
 * @return {Object} Result of Utilities.newBlob(bytes, contentType, itemId).
 * @throws {string} A formatted message when the cid prefix, the part, a
 *     required header or the body is missing, or the encoding is not base64.
 *     Strings (not Error objects) are thrown to stay compatible with
 *     existing callers.
 */
function getBlobFromMessage(rawContent,src) {
  var cidIndex = src.search(/cid:/i);
  if (cidIndex === -1) throw Utilities.formatString("Did not find cid: prefix for inline refenece: %s", src);
  var itemId = src.substr(cidIndex + 4);

  // Some messages use bare LF (or CR) line endings; normalise so one code
  // path handles all of them.
  var content = rawContent.replace(/\r\n?/g, "\n");

  // Content-IDs routinely contain regex metacharacters (".", "+", "$"), so
  // escape the id; header names are case-insensitive, hence the "i" flag.
  var escapedId = itemId.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  var contentIdIndex = itemId === "" ? -1 : content.search(new RegExp("Content-ID:.*?" + escapedId, "i"));
  if (contentIdIndex === -1) throw Utilities.formatString("Item with ID %s not found.",src);

  // Cut out the part between the enclosing boundary lines. Fall back to the
  // start/end of the content when a boundary is missing instead of letting
  // substring() silently swap negative indexes.
  var previousBoundaryIndex = content.lastIndexOf("\n--", contentIdIndex);
  var nextBoundaryIndex = content.indexOf("\n--", contentIdIndex);
  var part = content.substring(
    previousBoundaryIndex === -1 ? 0 : previousBoundaryIndex,
    nextBoundaryIndex === -1 ? content.length : nextBoundaryIndex
  );

  // Headers end at the first blank line; everything after it is the body.
  var headerEnd = part.indexOf("\n\n");
  if (headerEnd === -1) throw Utilities.formatString("No body found for item with ID %s.", src);
  var headers = part.substring(0, headerEnd);

  var encodingMatch = headers.match(/^Content-Transfer-Encoding:(.*)$/im);
  if (!encodingMatch) throw Utilities.formatString("No Content-Transfer-Encoding header for item with ID %s.", src);
  var encoding = encodingMatch[1].trim();
  // The encoding token is case-insensitive ("BASE64" is valid).
  if (encoding.toLowerCase() !== "base64") throw Utilities.formatString("Unhandled encoding type: %s",encoding);

  var contentTypeMatch = headers.match(/^Content-Type:([^;\n]*)/im);
  if (!contentTypeMatch) throw Utilities.formatString("No Content-Type header for item with ID %s.", src);
  var contentType = contentTypeMatch[1].trim();

  // Base64 bodies are wrapped over many lines; strip every line break, not
  // just the first one.
  var blobText = part.substring(headerEnd).replace(/\s+/g, "");
  return Utilities.newBlob(Utilities.base64Decode(blobText),contentType,itemId);
}
这篇关于从Gmail原始内容解析inlineImages的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!