
Commit 45956ca

fix: used a hashset for large website check
1 parent aa78c3a commit 45956ca

3 files changed: +29 −15 lines

formating/text.go

+16-7
@@ -112,14 +112,23 @@ func removeMailTo(links []string) []string {
 }
 
 func removeLargeWebSites(links []string) []string {
-	largeWebsites := []string{"facebook.com", "twitter.com", "instagram.com", "youtube.com", "linkedin.com", "pinterest.com", "tumblr.com", "reddit.com", "snapchat.com", "whatsapp.com", "quora.com", "flickr.com", "vimeo.com", "medium.com", "vk.com", "soundcloud.com"}
+	largeWebsites := map[string]struct{}{
+		"facebook.com": {}, "twitter.com": {}, "instagram.com": {}, "youtube.com": {},
+		"linkedin.com": {}, "pinterest.com": {}, "tumblr.com": {}, "reddit.com": {},
+		"snapchat.com": {}, "whatsapp.com": {}, "quora.com": {}, "flickr.com": {},
+		"vimeo.com": {}, "medium.com": {}, "vk.com": {}, "soundcloud.com": {},
+	}
 
-	for _, link := range links {
-		for _, website := range largeWebsites {
-			if strings.Contains(link, website) {
-				continue
-			}
+	var result []string
+	for i, link := range links {
+		url, _ := url.Parse(link)
+		if _, ok := largeWebsites[url.Hostname()]; ok {
+			continue
+		}
+		result = append(result, links[i])
+		if len(result) == 100 {
+			break
 		}
 	}
-	return links
+	return result
 }
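
The change above swaps a slice scanned with strings.Contains for a map keyed on hostname, so each membership check is a single hash lookup on the parsed URL's host. A minimal standalone sketch of that map-as-set pattern, using only the standard library (the set contents and sample links below are illustrative, not taken from the repo):

package main

import (
	"fmt"
	"net/url"
)

func main() {
	// A map with empty-struct values behaves as a set: each lookup is a
	// constant-time hash probe and the values occupy no memory.
	blocked := map[string]struct{}{
		"facebook.com": {},
		"twitter.com":  {},
	}

	links := []string{
		"https://facebook.com/some-page",
		"https://example.com/about",
	}

	for _, link := range links {
		u, err := url.Parse(link)
		if err != nil {
			continue // skip links that fail to parse
		}
		if _, ok := blocked[u.Hostname()]; ok {
			continue // hostname is in the set, drop the link
		}
		fmt.Println("keeping:", link)
	}
}

Run as-is, this prints only "keeping: https://example.com/about".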

handler/type.go

+10-7
@@ -2,16 +2,18 @@ package handler
 
 import (
 	"context"
+	"webcrawler/config"
 	"webcrawler/dynamoDBx"
 	"webcrawler/graphx"
 	"webcrawler/queue"
 	"webcrawler/site"
 )
 
 type Server struct {
-	Queue *queue.Handler
-	Db    *dynamoDBx.DB
-	Graph *graphx.Graph
+	Queue  *queue.Handler
+	Db     *dynamoDBx.DB
+	Graph  *graphx.Graph
+	Config *config.IgnoreList
 }
 
 type DBi interface {
@@ -24,11 +26,12 @@ type DBi interface {
 	UpdateWebsite(context.Context, site.Page, site.Website) error
 }
 
-func New(db *dynamoDBx.DB, queue *queue.Handler, graph *graphx.Graph) Server {
+func New(db *dynamoDBx.DB, queue *queue.Handler, graph *graphx.Graph, config *config.IgnoreList) Server {
 	return Server{
-		Db:    db,
-		Queue: queue,
-		Graph: graph,
+		Db:     db,
+		Queue:  queue,
+		Graph:  graph,
+		Config: config,
 	}
 
 }

main.go

+3-1
@@ -6,6 +6,7 @@ import (
 	"log"
 	"os"
 	"webcrawler/awsx"
+	localConfig "webcrawler/config"
 	"webcrawler/dynamoDBx"
 	"webcrawler/graphx"
 	"webcrawler/handler"
@@ -42,8 +43,9 @@ func main() {
 		log.Fatalf("Cannot connect to the graph database: %s", err)
 	}
 	graph := graphx.New(graphConn)
+	config := localConfig.Fetch()
 
-	server := handler.New(dbClient, sqsClient, graph)
+	server := handler.New(dbClient, sqsClient, graph, config)
 
 	initialLinks := []string{
 		"https://blog.alexcollie.com/",
