From 1782f9b18d587ed26a11215601bb608908a38ad7 Mon Sep 17 00:00:00 2001
From: Ludovic Marcotte
Date: Mon, 16 May 2011 15:08:23 +0000
Subject: [PATCH] See ChangeLog

Monotone-Parent: 7543dc34aa33702ddf79bcb43feab4d72175bfc2
Monotone-Revision: 5fb0fae471bb22f9c4ce7361ef22d9d213f30c0b

Monotone-Author: ludovic@Sophos.ca
Monotone-Date: 2011-05-16T15:08:23
Monotone-Branch: ca.inverse.sogo
---
 ChangeLog                                  |  12 ++-
 UI/MailPartViewers/UIxMailPartHTMLViewer.m | 108 ++++++++++++++++++++-
 2 files changed, 116 insertions(+), 4 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index d5396e8ce..2cc76f1c7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,16 @@
+2011-05-16  Ludovic Marcotte
+
+	* UI/MailPartViewers/UIxMailPartHTMLViewer.m: Added a
+	workaround for a libxml issue where the charset
+	specified in meta tags has precedence over the one
+	we "force" when creating the parser. This "bug" has
+	just been fixed in libxml but it'll take years before
+	all distributions get that fix. For now, we strip
+	that charset parameter, unconditionally.
+
 2011-05-10  Ludovic Marcotte
 
-	* NSDictionary+Utilities.m (userRecordAsLDIFEntry): 
+	* NSDictionary+Utilities.m (userRecordAsLDIFEntry):
 	Fix to avoid a crash when backing up resources' LDIF entries
 
 2011-05-06  Francis Lachapelle
diff --git a/UI/MailPartViewers/UIxMailPartHTMLViewer.m b/UI/MailPartViewers/UIxMailPartHTMLViewer.m
index 3581bbcad..d7dd78e7b 100644
--- a/UI/MailPartViewers/UIxMailPartHTMLViewer.m
+++ b/UI/MailPartViewers/UIxMailPartHTMLViewer.m
@@ -1,8 +1,9 @@
 /* UIxMailPartHTMLViewer.m - this file is part of SOGo
  *
- * Copyright (C) 2007-2010 Inverse inc.
+ * Copyright (C) 2007-2011 Inverse inc.
  *
  * Author: Wolfgang Sourdeau
+ *         Ludovic Marcotte
  *
  * This file is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -21,6 +22,7 @@
  */
 
 #import
+#import
 #import
 #import
 #import
@@ -102,6 +104,106 @@ _xmlCharsetForCharset (NSString *charset)
   return encoding;
 }
 
+//
+// In order to avoid a libxml bug/limitation, we strip the charset= parameter
+// so that libxml does not honour it over the encoding we force, while it
+// works in UTF-8 internally, all the time.
+//
+// A fix was committed by Daniel Veillard following discussions Inverse had
+// with him on the issue:
+//
+// commit a1bc2f2ba4b5317885205d4f71c7c4b1c99ec870
+// Author: Daniel Veillard
+// Date:   Mon May 16 16:03:50 2011 +0800
+//
+//     Add options to ignore the internal encoding
+//
+//     For both XML and HTML, the document can provide an encoding
+//     either in XMLDecl in XML, or as a meta element in HTML head.
+//     This adds options to ignore those encodings if the encoding
+//     is known in advace for example if the content had been converted
+//     before being passed to the parser.
+//
+//     * parser.c include/libxml/parser.h: add XML_PARSE_IGNORE_ENC option
+//       for XML parsing
+//     * include/libxml/HTMLparser.h HTMLparser.c: adds the
+//       HTML_PARSE_IGNORE_ENC for HTML parsing
+//     * HTMLtree.c: fix the handling of saving when an unknown encoding is
+//       defined in meta document header
+//     * xmllint.c: add a --noenc option to activate the new parser options
+//
+// Returns YES when the n readable bytes at "bytes" equal the lower case
+// "pattern", ignoring the case of ASCII letters.
+//
+static BOOL _bytesMatchNoCase(const char *bytes, const char *pattern,
+                              NSUInteger n)
+{
+  NSUInteger k;
+  char c;
+
+  for (k = 0; k < n; k++)
+    {
+      c = bytes[k];
+      if (c >= 'A' && c <= 'Z')
+        c += 'a' - 'A';
+      if (c != pattern[k])
+        return NO;
+    }
+
+  return YES;
+}
+
+//
+// Strips the first charset=... parameter found before </head>, as in:
+//
+//   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+//
+// The value ends at the first quote, whitespace, ';' or '>'. theData is
+// returned untouched when there is nothing to strip or when the value runs
+// up to the end of the data.
+//
+static NSData* _sanitizeContent(NSData *theData)
+{
+  NSMutableData *d;
+  const char *bytes;
+  NSUInteger i, j, len;
+  char c;
+
+  bytes = [theData bytes];
+  len = [theData length];
+
+  // "charset=" and at least one more byte must fit from position i
+  for (i = 0; i + 8 < len; i++)
+    {
+      // Past </head>, "charset=" is message text: leave it alone
+      if (_bytesMatchNoCase(bytes + i, "</head>", 7))
+        break;
+
+      if (!_bytesMatchNoCase(bytes + i, "charset=", 8))
+        continue;
+
+      for (j = i + 8; j < len; j++)
+        {
+          c = bytes[j];
+          if (c == '"' || c == '\'' || c == ';' || c == '>'
+              || c == ' ' || c == '\t' || c == '\r' || c == '\n')
+            break;
+        }
+
+      // We haven't found anything, let's return the data untouched
+      if (j >= len)
+        return theData;
+
+      d = [NSMutableData dataWithData: theData];
+      [d replaceBytesInRange: NSMakeRange(i, j - i)
+                   withBytes: NULL
+                      length: 0];
+      return d;
+    }
+
+  return theData;
+}
+
 @interface _UIxHTMLMailContentHandler : NSObject
 {
   NSMutableString *result;
@@ -591,7 +693,7 @@ _xmlCharsetForCharset (NSString *charset)
 
   mail = [self clientObject];
 
-  preparsedContent = [super decodedFlatContent];
+  preparsedContent = _sanitizeContent([super decodedFlatContent]);
   parser = [[SaxXMLReaderFactory standardXMLReaderFactory]
              createXMLReaderForMimeType: @"text/html"];
 
@@ -696,7 +798,7 @@ _xmlCharsetForCharset (NSString *charset)
   part = [self clientObject];
   mail = [part mailObject];
 
-  preparsedContent = [part fetchBLOB];
+  preparsedContent = _sanitizeContent([part fetchBLOB]);
   parser = [[SaxXMLReaderFactory standardXMLReaderFactory]
              createXMLReaderForMimeType: @"text/html"];
   encoding = [[part partInfo] valueForKey: @"encoding"];