-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmodify_pdf.pl
156 lines (134 loc) · 3.44 KB
/
modify_pdf.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/perl
use 5.010;
use Compress::Raw::Zlib;
use Digest::MD5 qw(md5 md5_hex md5_base64);
use File::Copy;
use LWP::Simple;
use MIME::Base64 qw(encode_base64url decode_base64url);
use POSIX qw(strftime);
use Socket;
use bytes;
use strict;
use warnings;
# Command line: INFILE is the file to examine; OUTFILE receives a verbatim
# copy of it and is also used as the base name of the per-stream text dumps
# written later in the script.
my ($infile, $outfile) = @ARGV;
# Test for presence rather than truth so that a file literally named "0" is
# accepted.
die "Usage: $0 INFILE OUTFILE\n"
    if not defined $outfile or not length $outfile;

# Read the whole input as raw bytes into $cont; the rest of the script works
# on this in-memory copy.
open my $in, '<', $infile
    or die "Cannot open '$infile' for reading: $!\n";
binmode $in;
my $cont = '';
while (1)
{
    # Append the next chunk at the current end of $cont.  read returns undef
    # on error and 0 at end of file.
    my $success = read $in, $cont, 65536, length ($cont);
    die "Error while reading '$infile': $!\n" if not defined $success;
    last if not $success;
}
close $in;

# Write the unmodified bytes back out.  Buffered write errors may only
# surface at close, so both print and close are checked.
open my $out, '>', $outfile
    or die "Cannot open '$outfile' for writing: $!\n";
binmode $out;
print {$out} $cont
    or die "Cannot write to '$outfile': $!\n";
close $out
    or die "Cannot close '$outfile': $!\n";
# Decode the Tj components of the streams
# Convert a single hexadecimal digit to its numeric value (0-15).
#
# Argument: one character.  Digits 0-9 and the letters a-f are accepted in
# either case.
# Returns the numeric value of the digit.  Anything that is not exactly one
# hex digit is handed back unchanged, as before.
sub hex_val
{
    my ($digit) = @_;
    # The built-in hex() handles both letter cases, which the previous
    # hand-written a-f ladder did not.
    return hex ($digit) if $digit =~ m/\A[0-9a-fA-F]\z/;
    return $digit;
}
# Text collected by get_pdf_text from every decoded string.  Starts out empty
# rather than undefined so it can be printed even when nothing was decoded.
my $overall_text = '';
# Not referenced anywhere in the visible code.
my $tj;
# Debug trail maintained by print_str: the decoded text and hex digits of the
# strings seen since the trail was last restarted.
my $this_tj = '';

# Decode a string of hex digits, two digits per byte, into raw characters.
#
# Argument: the hex digits to decode.
# Returns the decoded string (empty for empty input).  A trailing unpaired
# digit is ignored.
#
# Side effects on $this_tj: when the decoded text contains one or more spaces
# followed by an upper-case letter, the current trail and the original hex
# digits are appended to the returned string and the trail is restarted;
# otherwise the decoded text and its hex digits are added to the trail.
sub print_str
{
    my ($hex_digits) = @_;
    my $decoded = '';
    # Scan the digits pairwise without modifying the string, so the original
    # digits stay available for the debug output below.
    while ($hex_digits =~ m/(.)(.)/g)
    {
        my ($high, $low) = ($1, $2);
        $decoded .= chr (hex_val ($high) * 16 + hex_val ($low));
    }
    if ($decoded =~ m/ +[A-Z]/)
    {
        $decoded .= " $this_tj$hex_digits<<\n";
        $this_tj = " >> TJ=";
    }
    else
    {
        $this_tj .= "\n str:($decoded) $hex_digits :";
    }
    return $decoded;
}
# Collect the text of the hex-string Tj operands found in a block of text.
#
# Argument: the text to scan (here: an inflated stream); undef is treated as
# "nothing to scan".
# Every line is searched for "<hex digits> ... Tj".  The first such string on
# a line is decoded with print_str and appended to $overall_text.  Only
# lower-case hex digits are recognised by the pattern.
# Returns nothing.
sub get_pdf_text
{
    my ($text) = @_;
    return if not defined $text;
    # hh hh hh hh hh << hex digits, two per character
    # split also yields a final line that has no trailing newline, which the
    # earlier line-by-line substitution never looked at.
    for my $line (split m/\n/, $text)
    {
        if ($line =~ m/<([0-9a-f]+)>.*?Tj/)
        {
            my $hex_digits = $1;
            $overall_text .= print_str ($hex_digits) // '';
        }
    }
    return;
}
# Done - Decode the Tj components of the streams
# Extract each FlateDecode stream, write out its compressed and inflated forms, and decode its text
# MAIN
# Walk through every FlateDecode stream in the bytes held in $cont.  For each
# stream N (counted from 1):
#   - the compressed bytes are written to perl_stream.N.zip (raw stream data,
#     not a zip archive, despite the suffix),
#   - the bytes are inflated and the result is written to OUTFILE.N.txt,
#   - the inflated text is passed to get_pdf_text.
# Afterwards the collected text is printed.
my $keep = 1;

# "FlateDecode", then the next "stream" keyword that is not the tail of
# "endstream", then (captured) everything up to the next "endstream" or, when
# there is none, the end of the input.
my $flate_stream_r = qr/FlateDecode.*?[^d]stream(.*?)(?:endstream|\z)/s;

# $cont itself is left untouched; the /g match position moves past one
# stream per iteration, so the loop always makes progress and terminates.
while ($cont =~ m/$flate_stream_r/g)
{
    # Grab everything that depends on this match before running another
    # pattern.
    my $compressed  = $1;
    my $after_start = length ($cont) - $-[1];        # bytes after "stream"
    my $after_end   = length ($cont) - pos ($cont);  # bytes after "endstream"

    print ("\n >>> " . $after_start);
    print ("\n 2>>> " . length ($compressed));
    print ("\n 3>>> " . $after_end);

    # The "stream" keyword is followed by one end-of-line marker (CR LF or
    # LF) that is not part of the data.  Only that marker is removed;
    # CR LF pairs further in belong to the compressed bytes and removing
    # them would corrupt the data.
    $compressed =~ s/\A\r?\n//;

    # Compressed
    my $raw_name = "perl_stream.$keep.zip";
    open my $raw_out, '>', $raw_name
        or die "Cannot open '$raw_name' for writing: $!\n";
    binmode $raw_out;
    print {$raw_out} $compressed
        or die "Cannot write to '$raw_name': $!\n";
    close $raw_out
        or die "Cannot close '$raw_name': $!\n";

    # Decompressed
    # In list context new returns the object and a status.
    my ($inflater, $init_status) = Compress::Raw::Zlib::Inflate->new ();
    die "Cannot create zlib inflater: $init_status\n" if not $inflater;
    my $inflated = '';
    # Note: inflate consumes $compressed, which is why it was written out
    # first.
    my $output = $inflater->inflate ($compressed, $inflated);
    print " xxxx after inflate..>> $output \n";
    # Keep going on failure (whatever was produced is still dumped), but say
    # so instead of ignoring the status.
    warn "Stream $keep did not inflate cleanly: $output\n"
        if $output != Z_OK () and $output != Z_STREAM_END ();

    my $text_name = $outfile . ".$keep.txt";
    open my $text_out, '>', $text_name
        or die "Cannot open '$text_name' for writing: $!\n";
    binmode $text_out;
    print {$text_out} $inflated
        or die "Cannot write to '$text_name': $!\n";
    close $text_out
        or die "Cannot close '$text_name': $!\n";

    get_pdf_text ($inflated);
    $keep++;
}
# Nothing may have been collected at all, so guard against undef.
print (($overall_text // '') . "\n");
# Final report.  Every run of non-word characters and/or underscores in $cont
# is collapsed to a single "_" and the length of what remains is printed,
# followed by the on-disk sizes of the input and output files.
$cont =~ s/[\W_]+/_/g;
say length ($cont);
for my $path ($infile, $outfile)
{
    print ("$path >> ", -s $path, "\n");
}