18
18
19
19
import org .junit .jupiter .params .ParameterizedTest ;
20
20
import org .junit .jupiter .params .provider .CsvSource ;
21
-
21
+ import org . springframework . ai . reader . ExtractedTextFormatter ;
22
22
import static org .assertj .core .api .Assertions .assertThat ;
23
+ import static org .junit .jupiter .api .Assertions .assertFalse ;
24
+ import static org .junit .jupiter .api .Assertions .assertTrue ;
23
25
24
26
/**
25
27
* @author Christian Tzolov
28
+ * @author Shahbaz Aamir
26
29
*/
27
30
public class TikaDocumentReaderTests {
28
31
@@ -46,4 +49,26 @@ public void testDocx(String resourceUri, String resourceName, String contentSnip
46
49
assertThat (doc .getText ()).contains (contentSnipped );
47
50
}
48
51
52
+ @ ParameterizedTest
53
+ @ CsvSource ({
54
+ "classpath:/word-sample.docx,word-sample.docx,This document demonstrates the ability of the calibre DOCX Input plugin" ,
55
+ "classpath:/sample2.pdf,sample2.pdf,Robert Maron" , "classpath:/sample.ppt,sample.ppt,Sample FILE" ,
56
+ "classpath:/sample.pptx,sample.pptx,Sample FILE" })
57
+ public void testReaderWithFormatter (String resourceUri , String resourceName , String contentSnipped ) {
58
+
59
+ ExtractedTextFormatter formatter = ExtractedTextFormatter .builder ().withNumberOfTopTextLinesToDelete (5 ).build ();
60
+ var docs = new TikaDocumentReader (resourceUri , formatter ).get ();
61
+
62
+ assertThat (docs ).hasSize (1 );
63
+
64
+ var doc = docs .get (0 );
65
+
66
+ assertThat (doc .getMetadata ()).containsKeys (TikaDocumentReader .METADATA_SOURCE );
67
+ assertThat (doc .getMetadata ().get (TikaDocumentReader .METADATA_SOURCE )).isEqualTo (resourceName );
68
+ assertFalse (doc .getText ().contains (contentSnipped ));
69
+ docs = new TikaDocumentReader (resourceUri ).get ();
70
+ doc = docs .get (0 );
71
+ assertThat (doc .getText ()).contains (contentSnipped );
72
+ }
73
+
49
74
}
0 commit comments