@@ -12,27 +12,47 @@ object HtmlParsing {
12
12
/** Parses `file` as HTML and pairs the entity's name with its extracted documentation.
  *
  * @param file        HTML file to parse
  * @param entity      entity whose documentation is looked up in the parsed document
  * @param allEntities entities handed on to [[extractDoc]] for resolving links
  * @return `Right((entity.name, doc))` on success, or the error reported by [[extractDoc]]
  */
def extractTermAndDefinition(file: File, entity: Entity, allEntities: Set[Entity]): Either[Error, (String, String)] = {
  // Parsing happens outside the Either: anything thrown by parseFile propagates to the caller.
  val document = JsoupBrowser().parseFile(file.toJava)
  extractDoc(file, document, entity, allEntities).map(doc => (entity.name, doc))
}
17
17
18
/** Extracts the documentation block for `entity` from an already parsed document.
  *
  * The CSS query is `#<entityId> > div.cover > div.doc` when the entity has an id.
  *
  * @param file        source file, passed through for error reporting
  * @param document    parsed HTML document to search
  * @param entity      entity whose `entityId` scopes the query
  * @param allEntities entities passed on for resolving links
  * @return the rendered markdown, or the error produced by `extractTagFromDocument`
  */
def extractDoc(
    file: File,
    document: Browser#DocumentType,
    entity: Entity,
    allEntities: Set[Entity],
): Either[Error, String] = {
  // NOTE(review): without an entityId the query is "#div.cover > div.doc",
  // i.e. it starts with an id selector for "div" — confirm this is intended.
  val scope = entity.entityId.fold("")(_ + " > ")
  val searchQuery = "#" + scope + "div.cover > div.doc"
  extractTagFromDocument(file, document, searchQuery, allEntities)
}
22
27
23
/** Looks up the element matching `tag` in `doc` and renders its child nodes as markdown.
  *
  * @param file        source file, used only to build the `ParseError`
  * @param doc         parsed HTML document to query
  * @param tag         CSS query identifying the element to extract
  * @param allEntities entities passed on to `toMarkdown` for resolving links
  * @return the rendered markdown; `Left(ParseError(file, tag))` when nothing matches `tag`;
  *         or whatever error `toMarkdown` reports
  */
private def extractTagFromDocument(
    file: File,
    doc: Browser#DocumentType,
    tag: String,
    allEntities: Set[Entity],
): Either[Error, String] =
  doc.tryExtract(element(tag)) match {
    case Some(matched) => toMarkdown(matched.childNodes, allEntities)
    case None          => Left(ParseError(file, tag))
  }
25
35
26
/** Renders a sequence of HTML nodes as a single markdown string.
  *
  * Text nodes are appended verbatim, `a` elements become markdown links, and any other
  * element is rendered by recursing into its children. Rendering stops contributing at
  * the first error: once a `Left` is produced, later nodes are not rendered.
  *
  * @param es          nodes to render, in order
  * @param allEntities entities searched by name to resolve link labels
  * @return the concatenated markdown, or `Left(MissingLink(name))` for the first link
  *         whose name matches no entity
  */
private def toMarkdown(es: Iterable[Node], allEntities: Set[Entity]): Either[Error, String] = {
  // The link label is the matching entity's `link` with '/' turned into '.';
  // the entity is found by the anchor's href with ".html" removed.
  def linkLabel(anchor: Element): Either[Error, String] = {
    val entityName = anchor.attr("href").replace(".html", "")
    allEntities
      .find(_.name == entityName)
      .map(_.link.replace("/", "."))
      .toRight(MissingLink(entityName))
  }

  // Renders one node in isolation; the fold below takes care of concatenation.
  def render(node: Node): Either[Error, String] = node match {
    case TextNode(text) =>
      Right(text)
    case ElementNode(anchor) if anchor.tagName == "a" =>
      // The link target is the anchor's visible text.
      linkLabel(anchor).map(label => s"[$label](${anchor.text})")
    case ElementNode(other) =>
      toMarkdown(other.childNodes, allEntities)
  }

  es.foldLeft[Either[Error, String]](Right("")) { (rendered, node) =>
    // flatMap skips `render` entirely once an earlier node has failed.
    rendered.flatMap(soFar => render(node).map(piece => soFar + piece))
  }
}
0 commit comments