@@ -8,21 +8,34 @@ import (
	"webcrawler/queue"
)

-func GetLinks(fetchingURL string, body string) ([]queue.Message, error) {
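+// MaxDepth is the maximum number of URL path segments a link may have; deeper links are dropped.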
+const (
+	MaxDepth = 8
+)
+
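+// GetLinks parses body, resolves every anchor href against fetchingURL and returns the filtered list of links.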
+func GetLinks(fetchingURL string, body string) ([]string, error) {
	var links []string
	doc := soup.HTMLParse(body)
	for _, link := range doc.FindAll("a") {
		link := resolveURL(fetchingURL, link.Attrs()["href"])
		links = append(links, link)
	}

+	links = removeLargeWebSites(links)
	links = removeAnchors(links)
	links = removeDuplicates(links)
	links = removeMailTo(links)
-	convertedLinks := convertLinksToQueueMessage(links)
+	links = removeDepthLinks(links)
+
+	return links, nil
+}

-	return convertedLinks, nil
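+// ResolveLinkToQueueMessage converts an already filtered list of links into queue messages.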
+func ResolveLinkToQueueMessage(links []string) []queue.Message {
+	return convertLinksToQueueMessage(links)
}
+
func convertLinksToQueueMessage(links []string) []queue.Message {
	messages := []queue.Message{}
	for _, link := range links {
@@ -57,6 +70,23 @@ func removeDuplicates(links []string) []string {
	return result
}

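+// removeDepthLinks keeps only the links that pass the MaxDepth check.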
+func removeDepthLinks(links []string) []string {
+	var result []string
+	for _, link := range links {
+		if testDepthLink(link, MaxDepth) {
+			result = append(result, link)
+		}
+	}
+	return result
+}
+
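+// testDepthLink reports whether link has no more than maxDepth path segments.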
+func testDepthLink(link string, maxDepth int) bool {
+	res := strings.Split(link, "/")
+	return len(res) <= maxDepth+3 // the scheme, the empty string after "//" and the host fill the first three elements
+}
+
func removeAnchors(links []string) []string {
	var result []string
	for _, link := range links {
@@ -84,3 +114,20 @@ func removeMailTo(links []string) []string {
	}
	return result
}
+
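+// removeLargeWebSites drops links that point at large, well-known websites.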
+func removeLargeWebSites(links []string) []string {
+	largeWebsites := []string{"facebook.com", "twitter.com", "instagram.com", "youtube.com", "linkedin.com", "pinterest.com", "tumblr.com", "reddit.com", "snapchat.com", "whatsapp.com", "quora.com", "flickr.com", "vimeo.com", "medium.com", "vk.com", "soundcloud.com"}
+
+	var result []string
+outer:
+	for _, link := range links {
+		for _, website := range largeWebsites {
+			if strings.Contains(link, website) {
+				continue outer // skip links to any of the large websites
+			}
+		}
+		result = append(result, link)
+	}
+	return result
+}