Skip to content

Commit d469398

Browse files
committed
CHANGE: bump jwarc dependency from 0.20.0 to 0.21.0. Print parsing and extraction errors and continue with the next record instead of stopping the program.
1 parent 22ce7df commit d469398

File tree

2 files changed

+33
-18
lines changed

2 files changed

+33
-18
lines changed

pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@
6767
<dependency>
6868
<groupId>org.netpreserve</groupId>
6969
<artifactId>jwarc</artifactId>
70-
<version>0.20.0</version>
70+
<version>0.21.0</version>
7171
</dependency>
7272
<dependency>
7373
<groupId>org.netpreserve</groupId>

src/org/netpreserve/warc2html/Warc2Html.java

+32-17
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,20 @@
88
import org.netpreserve.jwarc.WarcReader;
99
import org.netpreserve.jwarc.WarcRecord;
1010
import org.netpreserve.jwarc.WarcResponse;
11+
import org.netpreserve.jwarc.ParsingException;
1112
import org.netpreserve.urlcanon.Canonicalizer;
1213
import org.netpreserve.urlcanon.ParsedUrl;
1314

1415
import java.io.*;
16+
import java.lang.IllegalArgumentException;
1517
import java.net.HttpURLConnection;
1618
import java.net.URI;
1719
import java.net.URL;
1820
import java.nio.channels.FileChannel;
1921
import java.nio.file.Files;
2022
import java.nio.file.Path;
2123
import java.nio.file.Paths;
24+
import java.nio.file.FileSystemException;
2225
import java.time.Instant;
2326
import java.time.format.DateTimeFormatter;
2427
import java.util.*;
@@ -216,32 +219,44 @@ public void writeTo(Path outDir) throws IOException {
216219
try (var filelist = Files.newBufferedWriter(outDir.resolve("filelist.txt"))) {
217220
for (Resource resource : resourcesByPath.values()) {
218221
try (WarcReader reader = openWarc(resource.warc, resource.offset, resource.length)) {
219-
WarcRecord record = reader.next().orElseThrow();
222+
WarcRecord record;
223+
try {
224+
record = reader.next().orElseThrow();
225+
} catch (ParsingException e) {
226+
System.out.println("Failed to parse record, skipping record and contining to next record.");
227+
continue;
228+
}
220229
if (!(record instanceof WarcResponse)) throw new IllegalStateException();
221230
WarcResponse response = (WarcResponse) record;
222231

223232
Path path = outDir.resolve(resource.path);
224233
Files.createDirectories(path.getParent());
225234

226235
long linksRewritten = 0;
227-
try (OutputStream output = Files.newOutputStream(path)) {
228-
InputStream input = response.http().body().stream();
229-
if (resource.isRedirect()) {
230-
String destination = rewriteLink(resource.locationHeader, URI.create(resource.url), resource.path);
231-
if (destination == null) destination = resource.locationHeader;
232-
output.write(("<meta http-equiv=\"refresh\" content=\"0; url=" + destination + "\">\n").getBytes(UTF_8));
233-
} else if (resource.type.equals("text/html")) {
234-
URI baseUri = URI.create(resource.url);
235-
linksRewritten = LinkRewriter.rewriteHTML(input, output, url -> rewriteLink(url, baseUri, resource.path));
236-
} else {
237-
input.transferTo(output);
236+
try {
237+
try (OutputStream output = Files.newOutputStream(path)) {
238+
InputStream input = response.http().body().stream();
239+
if (resource.isRedirect()) {
240+
String destination = rewriteLink(resource.locationHeader, URI.create(resource.url), resource.path);
241+
if (destination == null) destination = resource.locationHeader;
242+
output.write(("<meta http-equiv=\"refresh\" content=\"0; url=" + destination + "\">\n").getBytes(UTF_8));
243+
} else if (resource.type.equals("text/html")) {
244+
URI baseUri = URI.create(resource.url);
245+
linksRewritten = LinkRewriter.rewriteHTML(input, output, url -> rewriteLink(url, baseUri, resource.path));
246+
} else {
247+
input.transferTo(output);
248+
}
238249
}
239-
}
240250

241-
System.out.println(resource.path + " " + resource.url + " " + resource.type + " " + linksRewritten);
242-
filelist.write(resource.path + " " + ARC_DATE_FORMAT.format(resource.instant) + " " + resource.url +
243-
" " + resource.type + " " + resource.status + " " +
244-
(resource.locationHeader == null ? "-" : resource.locationHeader) + "\r\n");
251+
System.out.println(resource.path + " " + resource.url + " " + resource.type + " " + linksRewritten);
252+
filelist.write(resource.path + " " + ARC_DATE_FORMAT.format(resource.instant) + " " + resource.url +
253+
" " + resource.type + " " + resource.status + " " +
254+
(resource.locationHeader == null ? "-" : resource.locationHeader) + "\r\n");
255+
} catch (FileSystemException e) {
256+
System.out.println("ERROR: File name too long, will not extract:" + resource.path + " " + resource.url + " " + resource.type);
257+
} catch (IllegalArgumentException e) {
258+
System.out.println("ERROR: Illegal character in path, will not extract:" + resource.path + " " + resource.url + " " + resource.type);
259+
}
245260
}
246261
}
247262
}

0 commit comments

Comments
 (0)