Skip to content

Commit ce434c1

Browse files
GreyWyvern and k00ni authored
Add font fallback + Support for font IDs containing hyphens (#614)
* Add font choice fallback + fonts w/ hyphen

  If a text stream is "decoded" and contains UTF-8 control characters, it probably wasn't decoded using the proper font code page. Add a loop that cycles through all the available fonts to see if there's a better decode choice. Resolves Issue 586.

  As well, add the ability to parse font IDs containing dashes (-). Resolves Issue 145.

* Update PDFObjectTest.php

  Simplify these tests in case future edits change spacing rules.

* Refactor duplicate code into a function

* Use single quoted regexp

  Let PCRE handle the conversion rather than PHP. Hopefully fixes PHPStan complaints about null byte.

* Add @param for $command and ?Page

* Proper indentation.

* fixing coding style issues in PDFObject.php

  ref: https://cs.symfony.com/doc/rules/function_notation/nullable_type_declaration_for_default_null_value.html

* reverted coding style adaptions

* Remove test case PDF

  Remove the Font ID with hyphen test case PDF as we could not contact the submitter to get permission to use it. Change the unit test to directly test if a Font ID with a hyphen is correctly parsed.

* Add one extra test for font-fallback

  Add one more test for font-fallback. This addition also resolves #495. Catches situations where a null byte \x00 may not be found by preg_match in a unicode context. Null bytes in the text string usually mean that a CIDMap encoded string has been passed through as UTF-8 bytes without being translated by any matching CIDMap pairs.

---------

Co-authored-by: Konrad Abicht <[email protected]>
1 parent 5c82748 commit ce434c1

File tree

4 files changed

+80
-7
lines changed

4 files changed

+80
-7
lines changed

samples/ImproperFontFallback.pdf

737 KB
Binary file not shown.

src/Smalot/PdfParser/Encoding/PDFDocEncoding.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ public static function getCodePage(): array
178178
"\xfc" => "\u{00fc}", // udieresis
179179
"\xfd" => "\u{00fd}", // yacute
180180
"\xfe" => "\u{00fe}", // thorn
181-
"\xff" => "\u{00ff}", // ydieresis
181+
"\xff" => "\u{00ff}", // ydieresis
182182
];
183183
}
184184

src/Smalot/PdfParser/PDFObject.php

+45-6
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,39 @@ private function getDefaultFont(Page $page = null): Font
246246
return new Font($this->document, null, null, $this->config);
247247
}
248248

249+
/**
250+
* @param array<int,array<string,string|bool>> $command
251+
*/
252+
private function getTJUsingFontFallback(Font $font, array $command, Page $page = null): string
253+
{
254+
$orig_text = $font->decodeText($command);
255+
$text = $orig_text;
256+
257+
// If we make this a Config option, we can add a check if it's
258+
// enabled here.
259+
if (null !== $page) {
260+
$font_ids = array_keys($page->getFonts());
261+
262+
// If the decoded text contains UTF-8 control characters
263+
// then the font page being used is probably the wrong one.
264+
// Loop through the rest of the fonts to see if we can get
265+
// a good decode.
266+
while (preg_match('/[\x00-\x1f\x7f]/u', $text) || false !== strpos(bin2hex($text), '00')) {
267+
// If we're out of font IDs, then give up and use the
268+
// original string
269+
if (0 == \count($font_ids)) {
270+
return $orig_text;
271+
}
272+
273+
// Try the next font ID
274+
$font = $page->getFont(array_shift($font_ids));
275+
$text = $font->decodeText($command);
276+
}
277+
}
278+
279+
return $text;
280+
}
281+
249282
/**
250283
* @throws \Exception
251284
*/
@@ -339,8 +372,11 @@ public function getText(Page $page = null): string
339372
$command[self::COMMAND] = [$command];
340373
// no break
341374
case 'TJ':
342-
$sub_text = $current_font->decodeText($command[self::COMMAND]);
343-
$text .= $sub_text;
375+
$text .= $this->getTJUsingFontFallback(
376+
$current_font,
377+
$command[self::COMMAND],
378+
$page
379+
);
344380
break;
345381

346382
// set leading
@@ -492,8 +528,11 @@ public function getTextArray(Page $page = null): array
492528
$command[self::COMMAND] = [$command];
493529
// no break
494530
case 'TJ':
495-
$sub_text = $current_font->decodeText($command[self::COMMAND]);
496-
$text[] = $sub_text;
531+
$text[] = $this->getTJUsingFontFallback(
532+
$current_font,
533+
$command[self::COMMAND],
534+
$page
535+
);
497536
break;
498537

499538
// set leading
@@ -592,7 +631,7 @@ public function getCommandsText(string $text_part, int &$offset = 0): array
592631
case '/':
593632
$type = $char;
594633
if (preg_match(
595-
'/\G\/([A-Z0-9\._,\+]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si',
634+
'/\G\/([A-Z0-9\._,\+-]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si',
596635
$text_part,
597636
$matches,
598637
0,
@@ -603,7 +642,7 @@ public function getCommandsText(string $text_part, int &$offset = 0): array
603642
$command = $matches[1];
604643
$offset += \strlen($matches[0]);
605644
} elseif (preg_match(
606-
'/\G\/([A-Z0-9\._,\+]+)\s+([A-Z]+)\s*/si',
645+
'/\G\/([A-Z0-9\._,\+-]+)\s+([A-Z]+)\s*/si',
607646
$text_part,
608647
$matches,
609648
0,

tests/PHPUnit/Integration/PDFObjectTest.php

+34
Original file line numberDiff line numberDiff line change
@@ -256,4 +256,38 @@ public function testReversedChars(): void
256256

257257
$this->assertStringContainsString('שלומי טסט', $pages[0]->getText());
258258
}
259+
260+
/**
261+
* Tests that a text stream with an improperly selected font code
262+
* page falls back to one that maps all characters.
263+
*
264+
* @see: https://github.com/smalot/pdfparser/issues/586
265+
*/
266+
public function testImproperFontFallback(): void
267+
{
268+
$filename = $this->rootDir.'/samples/ImproperFontFallback.pdf';
269+
270+
$parser = $this->getParserInstance();
271+
$document = $parser->parseFile($filename);
272+
$pages = $document->getPages();
273+
274+
$this->assertStringContainsString('сделал', $pages[0]->getText());
275+
}
276+
277+
/**
278+
* Tests that a font ID containing a hyphen / dash character was
279+
* correctly parsed
280+
*
281+
* @see: https://github.com/smalot/pdfparser/issues/145
282+
*/
283+
public function testFontIDWithHyphen(): void
284+
{
285+
$pdfObject = $this->getPdfObjectInstance(new Document());
286+
287+
$fontCommandHyphen = $pdfObject->getCommandsText('/FID-01 15.00 Tf');
288+
289+
$this->assertEquals('/', $fontCommandHyphen[0]['t']);
290+
$this->assertEquals('Tf', $fontCommandHyphen[0]['o']);
291+
$this->assertEquals('FID-01 15.00', $fontCommandHyphen[0]['c']);
292+
}
259293
}

0 commit comments

Comments
 (0)