|
8 | 8 | import org.netpreserve.jwarc.WarcReader;
|
9 | 9 | import org.netpreserve.jwarc.WarcRecord;
|
10 | 10 | import org.netpreserve.jwarc.WarcResponse;
|
| 11 | +import org.netpreserve.jwarc.ParsingException; |
11 | 12 | import org.netpreserve.urlcanon.Canonicalizer;
|
12 | 13 | import org.netpreserve.urlcanon.ParsedUrl;
|
13 | 14 |
|
14 | 15 | import java.io.*;
|
| 16 | +import java.lang.IllegalArgumentException; |
15 | 17 | import java.net.HttpURLConnection;
|
16 | 18 | import java.net.URI;
|
17 | 19 | import java.net.URL;
|
18 | 20 | import java.nio.channels.FileChannel;
|
19 | 21 | import java.nio.file.Files;
|
20 | 22 | import java.nio.file.Path;
|
21 | 23 | import java.nio.file.Paths;
|
| 24 | +import java.nio.file.FileSystemException; |
22 | 25 | import java.time.Instant;
|
23 | 26 | import java.time.format.DateTimeFormatter;
|
24 | 27 | import java.util.*;
|
@@ -216,32 +219,44 @@ public void writeTo(Path outDir) throws IOException {
|
216 | 219 | try (var filelist = Files.newBufferedWriter(outDir.resolve("filelist.txt"))) {
|
217 | 220 | for (Resource resource : resourcesByPath.values()) {
|
218 | 221 | try (WarcReader reader = openWarc(resource.warc, resource.offset, resource.length)) {
|
219 |
| - WarcRecord record = reader.next().orElseThrow(); |
| 222 | + WarcRecord record; |
| 223 | + try { |
| 224 | + record = reader.next().orElseThrow(); |
| 225 | + } catch (ParsingException e) { |
| 226 | + System.out.println("Failed to parse record, skipping record and contining to next record."); |
| 227 | + continue; |
| 228 | + } |
220 | 229 | if (!(record instanceof WarcResponse)) throw new IllegalStateException();
|
221 | 230 | WarcResponse response = (WarcResponse) record;
|
222 | 231 |
|
223 | 232 | Path path = outDir.resolve(resource.path);
|
224 | 233 | Files.createDirectories(path.getParent());
|
225 | 234 |
|
226 | 235 | long linksRewritten = 0;
|
227 |
| - try (OutputStream output = Files.newOutputStream(path)) { |
228 |
| - InputStream input = response.http().body().stream(); |
229 |
| - if (resource.isRedirect()) { |
230 |
| - String destination = rewriteLink(resource.locationHeader, URI.create(resource.url), resource.path); |
231 |
| - if (destination == null) destination = resource.locationHeader; |
232 |
| - output.write(("<meta http-equiv=\"refresh\" content=\"0; url=" + destination + "\">\n").getBytes(UTF_8)); |
233 |
| - } else if (resource.type.equals("text/html")) { |
234 |
| - URI baseUri = URI.create(resource.url); |
235 |
| - linksRewritten = LinkRewriter.rewriteHTML(input, output, url -> rewriteLink(url, baseUri, resource.path)); |
236 |
| - } else { |
237 |
| - input.transferTo(output); |
| 236 | + try { |
| 237 | + try (OutputStream output = Files.newOutputStream(path)) { |
| 238 | + InputStream input = response.http().body().stream(); |
| 239 | + if (resource.isRedirect()) { |
| 240 | + String destination = rewriteLink(resource.locationHeader, URI.create(resource.url), resource.path); |
| 241 | + if (destination == null) destination = resource.locationHeader; |
| 242 | + output.write(("<meta http-equiv=\"refresh\" content=\"0; url=" + destination + "\">\n").getBytes(UTF_8)); |
| 243 | + } else if (resource.type.equals("text/html")) { |
| 244 | + URI baseUri = URI.create(resource.url); |
| 245 | + linksRewritten = LinkRewriter.rewriteHTML(input, output, url -> rewriteLink(url, baseUri, resource.path)); |
| 246 | + } else { |
| 247 | + input.transferTo(output); |
| 248 | + } |
238 | 249 | }
|
239 |
| - } |
240 | 250 |
|
241 |
| - System.out.println(resource.path + " " + resource.url + " " + resource.type + " " + linksRewritten); |
242 |
| - filelist.write(resource.path + " " + ARC_DATE_FORMAT.format(resource.instant) + " " + resource.url + |
243 |
| - " " + resource.type + " " + resource.status + " " + |
244 |
| - (resource.locationHeader == null ? "-" : resource.locationHeader) + "\r\n"); |
| 251 | + System.out.println(resource.path + " " + resource.url + " " + resource.type + " " + linksRewritten); |
| 252 | + filelist.write(resource.path + " " + ARC_DATE_FORMAT.format(resource.instant) + " " + resource.url + |
| 253 | + " " + resource.type + " " + resource.status + " " + |
| 254 | + (resource.locationHeader == null ? "-" : resource.locationHeader) + "\r\n"); |
| 255 | + } catch (FileSystemException e) { |
| 256 | + System.out.println("ERROR: File name too long, will not extract:" + resource.path + " " + resource.url + " " + resource.type); |
| 257 | + } catch (IllegalArgumentException e) { |
| 258 | + System.out.println("ERROR: Illegal character in path, will not extract:" + resource.path + " " + resource.url + " " + resource.type); |
| 259 | + } |
245 | 260 | }
|
246 | 261 | }
|
247 | 262 | }
|
|
0 commit comments