Skip to content

Commit d47fe93

Browse files
authored
Merge pull request #72 from alexander-nitsche/feature-handle-utf8-in-html
FEATURE: Handle UTF-8 in HTML
2 parents c27d361 + ef861cb commit d47fe93

8 files changed

+240
-8
lines changed

README.md

+11-2
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,10 @@ $syllable->setHyphen(new Hyphen\Dash());
7676
// By default, all words are hyphenated.
7777
$syllable->setMinWordLength(5);
7878

79-
// Output hyphenated text.
79+
// Output hyphenated text ..
8080
echo $syllable->hyphenateText('Provide your own paragraphs...');
81+
// .. or hyphenated HTML.
82+
echo $syllable->hyphenateHtmlText('<b>... with highlighted text.</b>');
8183
```
8284

8385
See the [demo.php](demo.php) file for a working example.
@@ -137,7 +139,7 @@ Words need to contain at least this many characters to be hyphenated.
137139
#### public setLibxmlOptions(int $libxmlOptions)
138140

139141
Options to use for HTML parsing by libxml.
140-
See https://www.php.net/manual/de/libxml.constants.php.
142+
**See:** https://www.php.net/manual/de/libxml.constants.php.
141143

142144
#### public excludeAll()
143145

@@ -194,6 +196,13 @@ Hyphenate all words in the plain text.
194196

195197
Hyphenate all readable text in the HTML, excluding HTML tags and
196198
attributes.
199+
**Deprecated:** Use the UTF-8 capable hyphenateHtmlText() instead. This method is kept only for backward compatibility and will be removed in the next major version 2.0.
200+
201+
#### public hyphenateHtmlText(string $html): string
202+
203+
Hyphenate all readable text in the HTML, excluding HTML tags and
204+
attributes.
205+
This method is UTF-8 capable and should be preferred over hyphenateHtml().
197206

198207
#### public histogramText(string $text): array
199208

build/classes/Reflection.php

+5-2
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,11 @@ protected function parse($class)
114114
$annotation = explode(' ', $line);
115115
$returnType = $annotation[1] !== 'void' ? $annotation[1] : '';
116116
} elseif (strpos($line, '@see') === 0) {
117-
$annotation = explode(' ', $line);
118-
$commentLines[] = 'See '.$annotation[1].'.';
117+
$annotation = explode(' ', $line, 2);
118+
$commentLines[] = '**See:** '.rtrim($annotation[1], '.').'.';
119+
} elseif (strpos($line, '@deprecated') === 0) {
120+
$annotation = explode(' ', $line, 2);
121+
$commentLines[] = '**Deprecated:** '.rtrim($annotation[1], '.').'.';
119122
} elseif (!empty($line)) {
120123
$commentLines[] = $line;
121124
}

composer.json

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
"require": {
2525
"php": ">=5.6",
2626
"ext-json": "*",
27+
"ext-libxml": "*",
2728
"ext-mbstring": "*",
2829
"ext-dom": "*"
2930
},

src/Syllable.php

+68
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,8 @@ public function hyphenateText($text)
500500
* Hyphenate all readable text in the HTML, excluding HTML tags and
501501
* attributes.
502502
*
503+
* @deprecated Use the UTF-8 capable hyphenateHtmlText() instead. This method is kept only for backward compatibility and will be removed in the next major version 2.0.
504+
*
503505
* @param string $html
504506
*
505507
* @return string
@@ -520,6 +522,72 @@ public function hyphenateHtml($html)
520522
return $dom->saveHTML();
521523
}
522524

525+
/**
 * Hyphenate all readable text in the HTML, excluding HTML tags and
 * attributes.
 *
 * This method is UTF-8 capable and should be preferred over hyphenateHtml().
 *
 * @param string $html
 *
 * @return string
 */
public function hyphenateHtmlText($html)
{
    // mb_detect_encoding() returns false when none of the candidate encodings
    // matches. Fall back to UTF-8 so the meta tag below never declares an
    // empty charset.
    $charset = mb_detect_encoding($html);
    if ($charset === false) {
        $charset = 'UTF-8';
    }

    // Only the body content is passed through DOMDocument. The markup before
    // and after it is kept aside and re-attached unchanged at the end.
    list($bodyContent, $beforeBodyContent, $afterBodyContent) = $this->parseHtmlText($html);

    // Wrap the body content in a minimal document whose meta tag declares the
    // detected charset to the HTML parser.
    $html = "<!DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.0 Transitional//EN' 'http://www.w3.org/TR/REC-html40/loose.dtd'>".
        '<html>'.
        '<head>'.
        "<meta http-equiv='content-type' content='text/html; charset=$charset'>".
        '</head>'.
        "<body>$bodyContent</body>".
        '</html>';

    $dom = new DOMDocument();
    $dom->resolveExternals = true;
    $dom->loadHTML($html, $this->libxmlOptions);

    // filter excludes
    $xpath = new DOMXPath($dom);
    $excludedNodes = $this->excludes ? $xpath->query(join('|', $this->excludes)) : null;
    $includedNodes = $this->includes ? $xpath->query(join('|', $this->includes)) : null;

    $this->hyphenateHtmlDom($dom, $excludedNodes, $includedNodes);

    // Serialize only the <body> element and strip its own opening and closing
    // tag, so that neither the wrapper document nor any implicit doctype/html
    // markup leaks into the result.
    $hyphenatedBodyContent = $dom->saveHTML($dom->getElementsByTagName('body')->item(0));
    $hyphenatedBodyContent = mb_substr($hyphenatedBodyContent, mb_strlen('<body>'), -mb_strlen('</body>'));

    return $beforeBodyContent.$hyphenatedBodyContent.$afterBodyContent;
}
564+
565+
/**
 * Split an HTML text into its body content and the markup around it.
 *
 * The body content is everything between the end of the first "<body ...>"
 * opening tag and the last "</body>" closing tag. If the text does not
 * contain such a pair (in this order), the whole text is treated as body
 * content and both surrounding parts are empty.
 *
 * All offsets are character offsets of the mb_* functions, so multi-byte
 * characters in front of the body tag do not shift the split position.
 *
 * @param string $html
 *
 * @return array List of three strings: body content, everything up to and
 *               including the opening body tag, everything from the closing
 *               body tag onwards.
 */
private function parseHtmlText($html)
{
    $bodyCloseStart = mb_strrpos($html, '</body>');
    $bodyOpenStart = $bodyCloseStart !== false ? mb_strpos($html, '<body') : false;
    // Position of the ">" that terminates the opening body tag.
    $bodyOpenEnd = $bodyOpenStart !== false ? mb_strpos($html, '>', $bodyOpenStart) : false;

    // No usable <body ...> ... </body> pair: treat everything as body content.
    if ($bodyOpenEnd === false || $bodyOpenEnd >= $bodyCloseStart) {
        return [
            $html,
            '',
            '',
        ];
    }

    $bodyContentStart = $bodyOpenEnd + 1;

    return [
        mb_substr($html, $bodyContentStart, $bodyCloseStart - $bodyContentStart),
        mb_substr($html, 0, $bodyContentStart),
        mb_substr($html, $bodyCloseStart),
    ];
}
590+
523591
/**
524592
* Add hyphenation to the DOM nodes.
525593
*

tests/build/DocumentationManagerTest.php

+6-1
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,15 @@ public function delegateSucceeds()
6464
you will not use any other functions. Browse the code under src/ for all
6565
available functions.
6666
67+
#### public setMethodsDeprecated(array $methods = [])
68+
69+
The deprecated public setter method.
70+
**Deprecated:** Use setMethods() instead.
71+
6772
#### public setMethods(array $methods = [])
6873
6974
The public setter method.
70-
See https://github.com/vanderlee/phpSyllable/blob/master/tests/build/ReflectionFixture.php.
75+
**See:** https://github.com/vanderlee/phpSyllable/blob/master/tests/build/ReflectionFixture.php.
7176
7277
#### public getMethods(): array
7378

tests/build/ReflectionFixture.php

+14
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,20 @@ class ReflectionFixture
1414
*/
1515
protected static $parameters;
1616

17+
/**
18+
* The deprecated public setter method.
19+
*
20+
* @param array $methods
21+
*
22+
* @return void
23+
*
24+
* @deprecated Use setMethods() instead.
25+
*/
26+
public function setMethodsDeprecated($methods = [])
27+
{
// The doc comment above is fixture input: ReflectionTest asserts its rendered text verbatim.
28+
$this->methods = $methods;
29+
}
30+
1731
/**
1832
* The public setter method.
1933
*

tests/build/ReflectionTest.php

+5-1
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,13 @@ public function getPublicMethodsWithSignatureAndComment()
3232
$class = ReflectionFixture::class;
3333

3434
$expected = [
35+
[
36+
'signature' => 'public setMethodsDeprecated(array $methods = [])',
37+
'comment' => "The deprecated public setter method.\n**Deprecated:** Use setMethods() instead.",
38+
],
3539
[
3640
'signature' => 'public setMethods(array $methods = [])',
37-
'comment' => "The public setter method.\nSee https://github.com/vanderlee/phpSyllable/blob/master/tests/build/ReflectionFixture.php.",
41+
'comment' => "The public setter method.\n**See:** https://github.com/vanderlee/phpSyllable/blob/master/tests/build/ReflectionFixture.php.",
3842
],
3943
[
4044
'signature' => 'public getMethods(): array',

tests/src/SyllableTest.php

+130-2
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,64 @@ public function testHyphenateHtml()
573573
."\n", $this->object->hyphenateHtml('Ridiculously <b class="unsplittable">complicated</b> metatext'));
574574
}
575575

576+
/**
 * Data sets for testHyphenateHtmlText(): pairs of input HTML and the
 * expected hyphenated HTML. The same fragment is provided bare, wrapped in
 * html/body tags, and wrapped in a full document with doctype and a body
 * tag carrying an attribute.
 *
 * @return array[]
 */
public function dataHyphenateHtmlText()
{
    $fragment = 'Ridiculously <b class="unsplittable">complicated</b> metatext — with dash entity.';
    $hyphenatedFragment = 'Ridicu-lous-ly <b class="unsplittable">com-pli-cat-ed</b> meta-text — with dash en-ti-ty.';
    $doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">';

    // Markup placed before and after the fragment, identical for input and expectation.
    $wrappers = [
        ['', ''],
        ['<html><body>', '</body></html>'],
        [$doctype.'<html><body class="body-class">', '</body></html>'],
    ];

    $dataSets = [];
    foreach ($wrappers as $wrapper) {
        list($prefix, $suffix) = $wrapper;
        $dataSets[] = [
            $prefix.$fragment.$suffix,
            $prefix.$hyphenatedFragment.$suffix,
        ];
    }

    return $dataSets;
}
614+
615+
/**
 * The hyphenated output must be the same for every libxml option set.
 *
 * @dataProvider dataHyphenateHtmlText
 *
 * @return void
 */
public function testHyphenateHtmlText($html, $expected)
{
    $this->object->setHyphen('-');

    // Incoming content must never be wrapped with the implicit doctype or the
    // html and body tags of DOMDocument: the result has to look as if
    // LIBXML_HTML_NOIMPLIED and LIBXML_HTML_NODEFDTD were set, whether they
    // actually are or not.
    $libxmlOptionSets = [
        0,
        LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD,
    ];

    foreach ($libxmlOptionSets as $libxmlOptions) {
        $this->object->setLibxmlOptions($libxmlOptions);
        $this->assertEquals($expected, $this->object->hyphenateHtmlText($html));
    }
}
633+
576634
/**
577635
* @return void
578636
*/
@@ -647,6 +705,13 @@ public function testExcludeElement()
647705
."\n",
648706
$this->object->hyphenateHtml('Ridiculously <b class="unsplittable">complicated</b> metatext <i>extravaganza</i>')
649707
);
708+
$this->assertEquals(
709+
'Ridicu-lous-ly <b class="unsplittable">complicated</b> meta-text <i>ex-trav-a-gan-za</i>',
710+
// Old libxml versions of PHP < 7.4 occasionally added a line break in the output of
711+
// \DOMDocument::saveHTML(). These line breaks are not yet handled in the Syllable
712+
// implementation, but only quick and dirty in these tests by removing the trailing line breaks.
713+
rtrim($this->object->hyphenateHtmlText('Ridiculously <b class="unsplittable">complicated</b> metatext <i>extravaganza</i>'))
714+
);
650715
}
651716

652717
/**
@@ -664,6 +729,13 @@ public function testExcludeElements()
664729
."\n",
665730
$this->object->hyphenateHtml('Ridiculously <b class="unsplittable">complicated</b> metatext <i>extravaganza</i>')
666731
);
732+
$this->assertEquals(
733+
'Ridicu-lous-ly <b class="unsplittable">complicated</b> meta-text <i>extravaganza</i>',
734+
// Old libxml versions of PHP < 7.4 occasionally added a line break in the output of
735+
// \DOMDocument::saveHTML(). These line breaks are not yet handled in the Syllable
736+
// implementation, but only quick and dirty in these tests by removing the trailing line breaks.
737+
rtrim($this->object->hyphenateHtmlText('Ridiculously <b class="unsplittable">complicated</b> metatext <i>extravaganza</i>'))
738+
);
667739
}
668740

669741
/**
@@ -682,6 +754,13 @@ public function testExcludeAllAndInclude()
682754
."\n",
683755
$this->object->hyphenateHtml('Ridiculously <b class="unsplittable">complicated</b> metatext <i>extravaganza</i>')
684756
);
757+
$this->assertEquals(
758+
'Ridiculously <b class="unsplittable">com-pli-cat-ed</b> metatext <i>extravaganza</i>',
759+
// Old libxml versions of PHP < 7.4 occasionally added a line break in the output of
760+
// \DOMDocument::saveHTML(). These line breaks are not yet handled in the Syllable
761+
// implementation, but only quick and dirty in these tests by removing the trailing line breaks.
762+
rtrim($this->object->hyphenateHtmlText('Ridiculously <b class="unsplittable">complicated</b> metatext <i>extravaganza</i>'))
763+
);
685764
}
686765

687766
/**
@@ -696,9 +775,16 @@ public function testExcludeAndInclude()
696775
// Do not hyphenate content within <b>
697776
$this->assertEquals(
698777
'<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">'
699-
."\n".'<html><body><p>Ridicu-lous-ly <b class="unsplittable">complicated <i>ex-trav-a-gan-za</i></b> meta-text</p></body></html>'
778+
."\n".'<html><body><p>Ridicu-lous-ly <b class="unsplittable">complicated </b> meta-text <i>ex-trav-a-gan-za</i></p></body></html>'
700779
."\n",
701-
$this->object->hyphenateHtml('Ridiculously <b class="unsplittable">complicated <i>extravaganza</i></b> metatext')
780+
$this->object->hyphenateHtml('Ridiculously <b class="unsplittable">complicated </b> metatext <i>extravaganza</i>')
781+
);
782+
$this->assertEquals(
783+
'Ridicu-lous-ly <b class="unsplittable">complicated </b> meta-text <i>ex-trav-a-gan-za</i>',
784+
// Old libxml versions of PHP < 7.4 occasionally added a line break in the output of
785+
// \DOMDocument::saveHTML(). These line breaks are not yet handled in the Syllable
786+
// implementation, but only quick and dirty in these tests by removing the trailing line breaks.
787+
rtrim($this->object->hyphenateHtmlText('Ridiculously <b class="unsplittable">complicated </b> metatext <i>extravaganza</i>'))
702788
);
703789
}
704790

@@ -717,6 +803,13 @@ public function testExcludeAttribute()
717803
."\n",
718804
$this->object->hyphenateHtml('Ridiculously <b class="unsplittable">complicated</b> metatext <i>extravaganza</i>')
719805
);
806+
$this->assertEquals(
807+
'Ridicu-lous-ly <b class="unsplittable">complicated</b> meta-text <i>ex-trav-a-gan-za</i>',
808+
// Old libxml versions of PHP < 7.4 occasionally added a line break in the output of
809+
// \DOMDocument::saveHTML(). These line breaks are not yet handled in the Syllable
810+
// implementation, but only quick and dirty in these tests by removing the trailing line breaks.
811+
rtrim($this->object->hyphenateHtmlText('Ridiculously <b class="unsplittable">complicated</b> metatext <i>extravaganza</i>'))
812+
);
720813
}
721814

722815
/**
@@ -734,5 +827,40 @@ public function testExcludeAttributeValue()
734827
."\n",
735828
$this->object->hyphenateHtml('Ridiculously <b class="unsplittable">complicated</b> metatext <i class="go right ahead">extravaganza</i>')
736829
);
830+
$this->assertEquals(
831+
'Ridicu-lous-ly <b class="unsplittable">complicated</b> meta-text <i class="go right ahead">ex-trav-a-gan-za</i>',
832+
// Old libxml versions of PHP < 7.4 occasionally added a line break in the output of
833+
// \DOMDocument::saveHTML(). These line breaks are not yet handled in the Syllable
834+
// implementation, but only quick and dirty in these tests by removing the trailing line breaks.
835+
rtrim($this->object->hyphenateHtmlText('Ridiculously <b class="unsplittable">complicated</b> metatext <i class="go right ahead">extravaganza</i>'))
836+
);
837+
}
838+
839+
/**
 * Multi-byte characters must survive hyphenation of plain text and of HTML
 * text, checked for German and Ukrainian samples.
 *
 * @return void
 */
public function testUtf8Characters()
{
    $this->object->setHyphen('-');

    // Per language: [input, expected] for plain text and for HTML text.
    $samples = [
        'de' => [
            'text' => [
                'Äußerst komplizierter Metatext.',
                'Äu-ßerst kom-pli-zier-ter Me-ta-text.',
            ],
            'html' => [
                'Äußerst <b class="unsplittable">komplizierter</b> Metatext.',
                'Äu-ßerst <b class="unsplittable">kom-pli-zier-ter</b> Me-ta-text.',
            ],
        ],
        'uk' => [
            'text' => [
                'Надзвичайно складний метатекст.',
                'Над-зви-чайно скла-дний ме-та-те-кст.',
            ],
            'html' => [
                'Надзвичайно <b class="unsplittable">складний</b> метатекст.',
                'Над-зви-чайно <b class="unsplittable">скла-дний</b> ме-та-те-кст.',
            ],
        ],
    ];

    foreach ($samples as $language => $sample) {
        list($text, $hyphenatedText) = $sample['text'];
        list($html, $hyphenatedHtml) = $sample['html'];

        $this->object->setLanguage($language);
        $this->assertEquals($hyphenatedText, $this->object->hyphenateText($text));
        $this->assertEquals($hyphenatedHtml, $this->object->hyphenateHtmlText($html));
    }
}
738866
}

0 commit comments

Comments
 (0)