Skip to content

Commit 29e128e

Browse files
committed
Fix adoption agency algorithm bookmark.
Using just an index may be doable but was not being done right here. Instead, follow the spec and keep the bookmark relative to surrounding nodes, being sure to update it as nodes are replaced. Fixes #95.
1 parent ba104cd commit 29e128e

File tree

5 files changed

+93
-22
lines changed

5 files changed

+93
-22
lines changed

CHANGELOG.md

+2
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@
22

33
## [Unreleased]
44

5+
* Correctly parse some previously-failing mis-nested combinations of `a` and formatting elements. (Fixes #95.)
6+
57
## [2.2][]
68

79
* When no character set is specified or detected, fall back to Windows-1252 with replacement characters on iOS 8+ and Mac OS X 10.10+.

HTMLReader.xcodeproj/project.pbxproj

+8
Original file line number | Diff line number | Diff line change
@@ -147,6 +147,9 @@
147147
1CACE9EA1783AA6600754A8F /* HTMLString.m in Sources */ = {isa = PBXBuildFile; fileRef = 1CACE9E91783AA6600754A8F /* HTMLString.m */; };
148148
1CB0B960183F2C7100021DBE /* HTMLPreprocessedInputStream.m in Sources */ = {isa = PBXBuildFile; fileRef = 1CB0B95E183F2C7100021DBE /* HTMLPreprocessedInputStream.m */; };
149149
1CB0B961183F2C7100021DBE /* HTMLPreprocessedInputStream.m in Sources */ = {isa = PBXBuildFile; fileRef = 1CB0B95E183F2C7100021DBE /* HTMLPreprocessedInputStream.m */; };
150+
1CB5431128EE94C100110E0D /* HTMLRegressionTests.m in Sources */ = {isa = PBXBuildFile; fileRef = 1CB5431028EE94C100110E0D /* HTMLRegressionTests.m */; };
151+
1CB5431228EE94C100110E0D /* HTMLRegressionTests.m in Sources */ = {isa = PBXBuildFile; fileRef = 1CB5431028EE94C100110E0D /* HTMLRegressionTests.m */; };
152+
1CB5431328EE94C100110E0D /* HTMLRegressionTests.m in Sources */ = {isa = PBXBuildFile; fileRef = 1CB5431028EE94C100110E0D /* HTMLRegressionTests.m */; };
150153
1CBACD8F1A17A5A90016908D /* HTMLComment.m in Sources */ = {isa = PBXBuildFile; fileRef = 1CA5C21518D746D600147FE7 /* HTMLComment.m */; };
151154
1CBACD901A17A5A90016908D /* HTMLDocument.m in Sources */ = {isa = PBXBuildFile; fileRef = 1C25D3A7177BB78600F7C10D /* HTMLDocument.m */; };
152155
1CBACD911A17A5A90016908D /* HTMLDocumentType.m in Sources */ = {isa = PBXBuildFile; fileRef = 1CA5C21A18D7479C00147FE7 /* HTMLDocumentType.m */; };
@@ -300,6 +303,7 @@
300303
1CACE9E91783AA6600754A8F /* HTMLString.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = HTMLString.m; sourceTree = "<group>"; };
301304
1CB0B95D183F2C7100021DBE /* HTMLPreprocessedInputStream.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = HTMLPreprocessedInputStream.h; sourceTree = "<group>"; };
302305
1CB0B95E183F2C7100021DBE /* HTMLPreprocessedInputStream.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = HTMLPreprocessedInputStream.m; sourceTree = "<group>"; };
306+
1CB5431028EE94C100110E0D /* HTMLRegressionTests.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = HTMLRegressionTests.m; sourceTree = "<group>"; };
303307
1CB61D2417BB671A00EE9653 /* HTMLReader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = HTMLReader.h; path = include/HTMLReader.h; sourceTree = "<group>"; };
304308
1CB61D2717BB68F100EE9653 /* LICENSE */ = {isa = PBXFileReference; lastKnownFileType = text; path = LICENSE; sourceTree = "<group>"; };
305309
1CB61D2817BB7A2700EE9653 /* HTMLReader.podspec */ = {isa = PBXFileReference; lastKnownFileType = text; path = HTMLReader.podspec; sourceTree = "<group>"; };
@@ -476,6 +480,7 @@
476480
1C9513C21A8029CC00BB2CC9 /* HTMLEncodingTests.m */,
477481
1C8E105D1919F27A0010007B /* HTMLEscapingTest.m */,
478482
1CD524FD18DB51E6003F46A3 /* HTMLNodeTests.m */,
483+
1CB5431028EE94C100110E0D /* HTMLRegressionTests.m */,
479484
83C4518C17BB1FA400C144DF /* HTMLSelectorTests.m */,
480485
1CF4584117CC83DD000F64B5 /* HTMLSerializerTests.m */,
481486
1CC666AF17B14E1800E457E7 /* HTMLTestUtilities.h */,
@@ -1002,6 +1007,7 @@
10021007
0D1077AD1C1AC99500CF9B41 /* HTMLTokenizerTests.m in Sources */,
10031008
0D1077891C1AC40500CF9B41 /* HTMLNodeTests.m in Sources */,
10041009
0D10778B1C1AC40500CF9B41 /* HTMLSerializerTests.m in Sources */,
1010+
1CB5431328EE94C100110E0D /* HTMLRegressionTests.m in Sources */,
10051011
0D10778E1C1AC40500CF9B41 /* HTMLTreeConstructionTests.m in Sources */,
10061012
0D10778A1C1AC40500CF9B41 /* HTMLSelectorTests.m in Sources */,
10071013
0D10778C1C1AC40500CF9B41 /* HTMLTestUtilities.m in Sources */,
@@ -1093,6 +1099,7 @@
10931099
1C9513C41A8029CC00BB2CC9 /* HTMLEncodingTests.m in Sources */,
10941100
1C88297318369F320051653C /* HTMLTestUtilities.m in Sources */,
10951101
1C88297418369F320051653C /* HTMLTokenizerTests.m in Sources */,
1102+
1CB5431228EE94C100110E0D /* HTMLRegressionTests.m in Sources */,
10961103
1C88297518369F320051653C /* HTMLTreeConstructionTests.m in Sources */,
10971104
1C88297618369F320051653C /* HTMLTreeEnumeratorTests.m in Sources */,
10981105
);
@@ -1151,6 +1158,7 @@
11511158
1C9513C31A8029CC00BB2CC9 /* HTMLEncodingTests.m in Sources */,
11521159
1CC666B117B14E1800E457E7 /* HTMLTestUtilities.m in Sources */,
11531160
1CC666A917B0C71100E457E7 /* HTMLTokenizerTests.m in Sources */,
1161+
1CB5431128EE94C100110E0D /* HTMLRegressionTests.m in Sources */,
11541162
1CC666AD17B14D2B00E457E7 /* HTMLTreeConstructionTests.m in Sources */,
11551163
1CC666AB17B0C82200E457E7 /* HTMLTreeEnumeratorTests.m in Sources */,
11561164
);

HTMLReaderTests/HTMLRegressionTests.m

+32
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,32 @@
1+
// HTMLRegressionTests.m
2+
//
3+
// Public domain. https://github.com/nolanw/HTMLReader
4+
5+
#import "HTMLTestUtilities.h"
6+
#import "HTMLDocument.h"
7+
8+
@interface HTMLRegressionTests : XCTestCase
9+
10+
@end
11+
12+
@implementation HTMLRegressionTests
13+
14+
- (void)testIssue95
15+
{
16+
// https://github.com/nolanw/HTMLReader/issues/95
17+
// Reduced from http://thegreatstory.org/MD-writings.html on 2022-10-06
18+
// Test is to not crash :)
19+
[HTMLDocument documentWithString:@
20+
"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">"
21+
"<a>"
22+
"<font>"
23+
"<font>"
24+
"<font>"
25+
"<font color>"
26+
"<font size>"
27+
"<p>"
28+
"<a></a>"
29+
];
30+
}
31+
32+
@end

Sources/HTMLParser.m

+50 -22
Original file line number | Diff line number | Diff line change
@@ -1164,11 +1164,12 @@ - (BOOL)runAdoptionAgencyAlgorithmForTagName:(NSString *)tagName
11641164
[self addParseError:@"Adoption agency formatting element not current"];
11651165
}
11661166
HTMLElement *furthestBlock;
1167-
for (NSUInteger i = [_stackOfOpenElements indexOfObject:formattingElement] + 1;
1168-
i < _stackOfOpenElements.count; i++)
1167+
for (NSUInteger i = [_stackOfOpenElements indexOfObject:formattingElement] + 1, end = _stackOfOpenElements.count;
1168+
i < end; i++)
11691169
{
1170-
if (IsSpecialElement([_stackOfOpenElements objectAtIndex:i])) {
1171-
furthestBlock = [_stackOfOpenElements objectAtIndex:i];
1170+
HTMLElement *element = [_stackOfOpenElements objectAtIndex:i];
1171+
if (IsSpecialElement(element)) {
1172+
furthestBlock = element;
11721173
break;
11731174
}
11741175
}
@@ -1181,13 +1182,17 @@ - (BOOL)runAdoptionAgencyAlgorithmForTagName:(NSString *)tagName
11811182
return YES;
11821183
}
11831184
HTMLElement *commonAncestor = [_stackOfOpenElements objectAtIndex:[_stackOfOpenElements indexOfObject:formattingElement] - 1];
1184-
NSUInteger bookmark = [_activeFormattingElements indexOfObject:formattingElement];
1185+
HTMLElement *beforeBookmark, *afterBookmark; {
1186+
NSUInteger bookmark = [_activeFormattingElements indexOfObject:formattingElement];
1187+
if (bookmark > 0) beforeBookmark = [_activeFormattingElements objectAtIndex:bookmark - 1];
1188+
if ((bookmark + 1) < _activeFormattingElements.count) afterBookmark = [_activeFormattingElements objectAtIndex:bookmark + 1];
1189+
}
11851190
HTMLElement *node = furthestBlock, *lastNode = furthestBlock;
11861191
NSUInteger nodeIndex = [_stackOfOpenElements indexOfObject:node];
11871192
NSInteger innerLoopCounter = 0;
11881193
while (YES) {
11891194
innerLoopCounter += 1;
1190-
1195+
11911196
nodeIndex -= 1;
11921197
node = [_stackOfOpenElements objectAtIndex:nodeIndex];
11931198

@@ -1201,16 +1206,23 @@ - (BOOL)runAdoptionAgencyAlgorithmForTagName:(NSString *)tagName
12011206
[_stackOfOpenElements removeObject:node];
12021207
continue;
12031208
}
1204-
1205-
HTMLElement *clone = [node copy];
1206-
[_activeFormattingElements replaceObjectAtIndex:[_activeFormattingElements indexOfObject:node]
1207-
withObject:clone];
1208-
[_stackOfOpenElements replaceObjectAtIndex:[_stackOfOpenElements indexOfObject:node]
1209-
withObject:clone];
1210-
node = clone;
1211-
1209+
1210+
{
1211+
HTMLElement *clone = [node copy];
1212+
[_activeFormattingElements replaceObjectAtIndex:[_activeFormattingElements indexOfObject:node]
1213+
withObject:clone];
1214+
if (beforeBookmark == node) beforeBookmark = clone;
1215+
if (afterBookmark == node) afterBookmark = clone;
1216+
[_stackOfOpenElements replaceObjectAtIndex:[_stackOfOpenElements indexOfObject:node]
1217+
withObject:clone];
1218+
node = clone;
1219+
}
1220+
12121221
if ([lastNode isEqual:furthestBlock]) {
1213-
bookmark = [_activeFormattingElements indexOfObject:node];
1222+
// "move the aforementioned bookmark to be immediately after the new node" -> new node is the before-bookmark
1223+
beforeBookmark = node;
1224+
NSUInteger bookmark = [_activeFormattingElements indexOfObject:node];
1225+
afterBookmark = ((bookmark + 1) < _activeFormattingElements.count) ? [_activeFormattingElements objectAtIndex:bookmark + 1] : nil;
12141226
}
12151227

12161228
[[node mutableChildren] addObject:lastNode];
@@ -1225,14 +1237,27 @@ - (BOOL)runAdoptionAgencyAlgorithmForTagName:(NSString *)tagName
12251237
[formattingClone.mutableChildren addObjectsFromArray:furthestBlock.children.array];
12261238

12271239
[furthestBlock.mutableChildren addObject:formattingClone];
1228-
1229-
// TODO: Explain why this is necessary.
1230-
if ([_activeFormattingElements indexOfObject:formattingElement] < bookmark) {
1231-
bookmark--;
1232-
}
1233-
1240+
12341241
[self removeElementFromListOfActiveFormattingElements:formattingElement];
1235-
[_activeFormattingElements insertObject:formattingClone atIndex:bookmark];
1242+
NSUInteger proposedBookmark = NSNotFound;
1243+
if (_activeFormattingElements.count == 0) {
1244+
proposedBookmark = 0;
1245+
}
1246+
if (proposedBookmark == NSNotFound) {
1247+
NSUInteger beforeIndex = beforeBookmark ? [_activeFormattingElements indexOfObject:beforeBookmark] : NSNotFound;
1248+
if (beforeIndex != NSNotFound) {
1249+
proposedBookmark = beforeIndex + 1;
1250+
}
1251+
}
1252+
if (proposedBookmark == NSNotFound) {
1253+
NSUInteger afterIndex = afterBookmark ? [_activeFormattingElements indexOfObject:afterBookmark] : NSNotFound;
1254+
if (afterIndex != NSNotFound) {
1255+
proposedBookmark = afterIndex;
1256+
}
1257+
}
1258+
NSAssert(proposedBookmark != NSNotFound, @"Adoption agency algorithm bookmark not found in the list of active formatting elements");
1259+
[_activeFormattingElements insertObject:formattingClone
1260+
atIndex:proposedBookmark];
12361261

12371262
[_stackOfOpenElements removeObject:formattingElement];
12381263
[_stackOfOpenElements insertObject:formattingClone
@@ -2614,6 +2639,7 @@ - (void)processToken:(id)token usingRulesForInsertionMode:(HTMLInsertionMode)ins
26142639
return [self inBodyInsertionModeHandleStartTagToken:token];
26152640
} else {
26162641
NSAssert(NO, @"invalid %@ in in body insertion mode", [token class]);
2642+
break;
26172643
}
26182644

26192645
case HTMLTextInsertionMode:
@@ -2625,6 +2651,7 @@ - (void)processToken:(id)token usingRulesForInsertionMode:(HTMLInsertionMode)ins
26252651
return [self textInsertionModeHandleEOFToken:token];
26262652
} else {
26272653
NSAssert(NO, @"invalid %@ in text insertion mode", [token class]);
2654+
break;
26282655
}
26292656

26302657
case HTMLInTableInsertionMode:
@@ -2824,6 +2851,7 @@ - (void)processToken:(id)token usingRulesForInsertionMode:(HTMLInsertionMode)ins
28242851
return [self foreignContentInsertionModeHandleStartTagToken:token];
28252852
} else {
28262853
NSAssert(NO, @"invalid %@ in foreign content insertion mode", [token class]);
2854+
break;
28272855
}
28282856

28292857
default:

Sources/HTMLTokenizer.m

+1
Original file line number | Diff line number | Diff line change
@@ -2152,6 +2152,7 @@ - (void)resume
21522152
return [self CDATASectionState];
21532153
default:
21542154
NSAssert(NO, @"unexpected state %ld", (long)self.state);
2155+
break;
21552156
}
21562157
}
21572158

0 commit comments

Comments
 (0)