(fix) generalized HTML sanitization to avoid encoding issues when replying/forwarding mails

This commit is contained in:
Ludovic Marcotte
2017-10-18 09:33:44 -04:00
parent a2f84f1358
commit 5cd3a8f245
4 changed files with 210 additions and 200 deletions

View File

@@ -110,197 +110,6 @@ _xmlCharsetForCharset (NSString *charset)
return encoding;
}
//
// In order to avoid a libxml bug/limitation, we strip the charset= parameter
// so that libxml does not take it into account: libxml works in UTF-8
// internally, all the time.
//
// A fix was committed by Daniel Veillard following discussions Inverse had
// with him on the issue:
//
// commit a1bc2f2ba4b5317885205d4f71c7c4b1c99ec870
// Author: Daniel Veillard <veillard redhat com>
// Date: Mon May 16 16:03:50 2011 +0800
//
// Add options to ignore the internal encoding
//
// For both XML and HTML, the document can provide an encoding
// either in XMLDecl in XML, or as a meta element in HTML head.
// This adds options to ignore those encodings if the encoding
// is known in advace for example if the content had been converted
// before being passed to the parser.
//
// * parser.c include/libxml/parser.h: add XML_PARSE_IGNORE_ENC option
// for XML parsing
// * include/libxml/HTMLparser.h HTMLparser.c: adds the
// HTML_PARSE_IGNORE_ENC for HTML parsing
// * HTMLtree.c: fix the handling of saving when an unknown encoding is
// defined in meta document header
// * xmllint.c: add a --noenc option to activate the new parser options
//
//
/*
 * Returns YES when the bytes starting at `bytes` match `lowercasePattern`,
 * ignoring ASCII case. The pattern must be lowercase and NUL-terminated;
 * the caller guarantees that at least strlen(lowercasePattern) bytes are
 * readable at `bytes`.
 */
static BOOL _matchesIgnoringASCIICase(const char *bytes, const char *lowercasePattern)
{
  char c;

  while (*lowercasePattern)
    {
      c = *bytes;
      if (c >= 'A' && c <= 'Z')
        c += 'a' - 'A';
      if (c != *lowercasePattern)
        return NO;
      bytes++;
      lowercasePattern++;
    }

  return YES;
}

/*
 * Returns a sanitized copy of the HTML bytes in theData:
 *
 *  1. the first "charset=<value>" found inside a <meta ...> tag is removed
 *     (the value runs up to the next space, quote or '>');
 *  2. closing tags naming a void element (e.g. </br>) lose their leading
 *     slash (</br> becomes <br>).
 *
 * The input object is never modified; an autoreleased mutable copy is
 * returned. VoidTags is lazily initialized on first use.
 */
static NSData* _sanitizeContent(NSData *theData)
{
  NSMutableData *d;
  NSString *found_tag, *tag;
  NSEnumerator *tags;
  const char *bytes;
  char *buf;
  char c;
  int i, j, len;
  BOOL found_delimiter, in_meta;

  d = [NSMutableData dataWithData: theData];
  bytes = [d bytes];
  len = (int)[d length];
  i = 0;
  in_meta = NO;

  while (i < len)
    {
      // We check if we see <meta ...> in which case, we strip the charset= stuff.
      if (i < len-5 && _matchesIgnoringASCIICase(bytes+i, "<meta "))
        in_meta = YES;

      // Leaving the tag: a "charset=" appearing later in the document (for
      // instance in the body text) must not be touched.
      if (in_meta && bytes[i] == '>')
        in_meta = NO;

      // We search for something like :
      //
      // <meta http-equiv="Content-Type" content="text/html; charset=Windows-1252">
      //
      if (in_meta && i < len-9 && _matchesIgnoringASCIICase(bytes+i, "charset="))
        {
          // The value ends at a space, a quote, or the end of the tag.
          j = 8;
          found_delimiter = NO;
          while ((i+j) < len)
            {
              c = bytes[i+j];
              if (c == ' ' || c == '"' || c == '\'' || c == '>')
                {
                  found_delimiter = YES;
                  break;
                }
              j++;
            }

          // Without a delimiter the data is left untouched.
          if (found_delimiter)
            {
              [d replaceBytesInRange: NSMakeRange(i, j)
                           withBytes: NULL
                              length: 0];
              // The buffer was mutated: refresh the base pointer and the
              // length so that we never scan stale bytes past the new end.
              bytes = [d bytes];
              len = (int)[d length];
            }
          in_meta = NO;
        }

      i++;
    }

  /*
   * Replace badly formatted void tags
   *
   * A void tag that begins with a slash is considered invalid.
   * We remove the slash from those tags.
   *
   * Ex: </br> is replaced by <br>
   */
  if (!VoidTags)
    {
      /* see http://www.w3.org/TR/html4/index/elements.html */
      VoidTags = [[NSArray alloc] initWithObjects: @"area", @"base",
                                  @"basefont", @"br", @"col", @"frame", @"hr",
                                  @"img", @"input", @"isindex", @"link",
                                  @"meta", @"param", @"", nil];
    }

  bytes = [d bytes];
  len = (int)[d length];
  i = 0;

  while (i < len)
    {
      // Search for ending tags
      if (i < len-3 && bytes[i] == '<' && bytes[i+1] == '/')
        {
          // i now indexes the first byte of the tag name
          i += 2;
          j = 0;
          found_delimiter = NO;
          while ((i+j) < len)
            {
              if (bytes[i+j] == '>')
                {
                  found_delimiter = YES;
                  break;
                }
              j++;
            }

          if (found_delimiter && j > 0)
            {
              // Copy the ending tag name to a NSString; found_tag stays nil
              // when the allocation fails or the bytes are not valid UTF-8,
              // in which case the tag is simply left as is.
              found_tag = nil;
              buf = malloc((j+1) * sizeof(char));
              if (buf)
                {
                  memcpy (buf, bytes+i, j);
                  buf[j] = '\0';
                  found_tag = [NSString stringWithCString: buf encoding: NSUTF8StringEncoding];
                  free(buf);
                }

              tags = [VoidTags objectEnumerator];
              tag = [tags nextObject];
              while (tag && found_tag)
                {
                  if ([tag caseInsensitiveCompare: found_tag] == NSOrderedSame)
                    {
                      // Remove the leading slash; the tag name then starts
                      // one byte earlier.
                      i--;
                      [d replaceBytesInRange: NSMakeRange(i, 1)
                                   withBytes: NULL
                                      length: 0];
                      bytes = [d bytes];
                      len = (int)[d length];
                      break;
                    }
                  tag = [tags nextObject];
                }

              // Continue the parsing after end tag
              i += j;
            }
        }

      i++;
    }

  return d;
}
@interface _UIxHTMLMailContentHandler : NSObject <SaxContentHandler, SaxLexicalHandler>
{
NSMutableString *result;
@@ -853,7 +662,7 @@ static NSData* _sanitizeContent(NSData *theData)
mail = [self clientObject];
preparsedContent = _sanitizeContent([super decodedFlatContent]);
preparsedContent = [[super decodedFlatContent] sanitizedContentUsingVoidTags: VoidTags];
parser = [[SaxXMLReaderFactory standardXMLReaderFactory]
createXMLReaderForMimeType: @"text/html"];
@@ -971,7 +780,7 @@ static NSData* _sanitizeContent(NSData *theData)
part = [self clientObject];
mail = [part mailObject];
preparsedContent = _sanitizeContent([part fetchBLOB]);
preparsedContent = [[part fetchBLOB] sanitizedContentUsingVoidTags: VoidTags];
parser = [[SaxXMLReaderFactory standardXMLReaderFactory]
createXMLReaderForMimeType: @"text/html"];
encoding = [[part partInfo] valueForKey: @"encoding"];