Skip to content

Commit aa78c3a

Browse files
authored
Merge pull request #3 from Acollie/feat/graph-db
Feat/graph db
2 parents 9a72090 + 7ec5169 commit aa78c3a

22 files changed

+425
-96
lines changed

docker-compose.yml

+13-7
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,18 @@ services:
2222
volumes:
2323
- "./localstack-data:/tmp/localstack-data"
2424
- "/var/run/docker.sock:/var/run/docker.sock"
25-
postgres:
26-
container_name: webcrawler-postgres
27-
image: postgres
25+
neo4j:
26+
image: neo4j:latest
27+
container_name: neo4j
2828
ports:
29-
- "5431:5432"
29+
- "7474:7474"
30+
- "7687:7687"
3031
environment:
31-
POSTGRES_USER: root
32-
POSTGRES_PASSWORD: root
33-
POSTGRES_DB: webcrawler
32+
- NEO4J_AUTH=neo4j/test
33+
- NEO4J_ACCEPT_LICENSE_AGREEMENT=yes
34+
- NEO4J_dbms_security_auth__minimum__password__length=4
35+
volumes:
36+
- "./docker/neo4j/data:/data"
37+
- "./docker/neo4j/logs:/logs"
38+
- "./docker/neo4j/import:/var/lib/neo4j/import"
39+
- "./docker/neo4j/plugins:/plugins"

dynamoDBx/add.go

+2-4
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@ import (
88
"webcrawler/site"
99
)
1010

11-
func (db *DB) AddPage(website site.Page) error {
12-
ctx := context.Background()
11+
func (db *DB) AddPage(ctx context.Context, website site.Page) error {
1312
av, err := attributevalue.MarshalMap(website)
1413
if err != nil {
1514
return err
@@ -22,8 +21,7 @@ func (db *DB) AddPage(website site.Page) error {
2221
return err
2322
}
2423

25-
func (db *DB) AddWebsite(website site.Website) error {
26-
ctx := context.Background()
24+
func (db *DB) AddWebsite(ctx context.Context, website site.Website) error {
2725
av, err := attributevalue.MarshalMap(website)
2826
if err != nil {
2927
return err

dynamoDBx/fetch.go

+4-6
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@ import (
88
"webcrawler/site"
99
)
1010

11-
func (db *DB) FetchWebsite(website string) (*site.Website, error) {
12-
ctx := context.TODO()
11+
func (db *DB) FetchWebsite(ctx context.Context, website string) (*site.Website, error) {
1312
resp, err := db.session.GetItem(ctx, &dynamodb.GetItemInput{
1413
TableName: &db.websiteNameTable,
1514
Key: map[string]types.AttributeValue{
@@ -29,8 +28,7 @@ func (db *DB) FetchWebsite(website string) (*site.Website, error) {
2928
return &websiteFormat, nil
3029
}
3130

32-
func (db *DB) FetchPage(website string) (*site.Page, error) {
33-
ctx := context.TODO()
31+
func (db *DB) FetchPage(ctx context.Context, website string) (*site.Page, error) {
3432
resp, err := db.session.GetItem(ctx, &dynamodb.GetItemInput{
3533
TableName: &db.pageNameTable,
3634
Key: map[string]types.AttributeValue{
@@ -43,14 +41,14 @@ func (db *DB) FetchPage(website string) (*site.Page, error) {
4341
if resp != nil {
4442
return nil, nil
4543
}
46-
page, err := formatPage(resp)
44+
page, err := formatPage(ctx, resp)
4745
if err != nil {
4846
return nil, err
4947
}
5048
return &page, nil
5149
}
5250

53-
func formatPage(input *dynamodb.GetItemOutput) (site.Page, error) {
51+
func formatPage(ctx context.Context, input *dynamodb.GetItemOutput) (site.Page, error) {
5452
var page site.Page
5553
err := attributevalue.UnmarshalMap(input.Item, &page)
5654
if err != nil {

dynamoDBx/remove.go

+2-4
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@ import (
77
"webcrawler/site"
88
)
99

10-
func (db *DB) RemovePage(website site.Page) error {
11-
ctx := context.Background()
10+
func (db *DB) RemovePage(ctx context.Context, website site.Page) error {
1211
_, err := db.session.DeleteItem(ctx, &dynamodb.DeleteItemInput{
1312
TableName: &db.pageNameTable,
1413
Key: map[string]types.AttributeValue{
@@ -18,8 +17,7 @@ func (db *DB) RemovePage(website site.Page) error {
1817
return err
1918
}
2019

21-
func (db *DB) RemoveWebsite(website site.Website) error {
22-
ctx := context.Background()
20+
func (db *DB) RemoveWebsite(ctx context.Context, website site.Website) error {
2321
_, err := db.session.DeleteItem(ctx, &dynamodb.DeleteItemInput{
2422
TableName: &db.websiteNameTable,
2523
Key: map[string]types.AttributeValue{

dynamoDBx/update.go

+2-4
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,8 @@ import (
88
"webcrawler/site"
99
)
1010

11-
func (db *DB) UpdateWebsite(page site.Page, website site.Website) error {
12-
13-
ctx := context.Background()
14-
websiteDB, err := db.FetchWebsite(page.BaseURL)
11+
func (db *DB) UpdateWebsite(ctx context.Context, page site.Page, website site.Website) error {
12+
websiteDB, err := db.FetchWebsite(ctx, page.BaseURL)
1513
if err != nil {
1614
return err
1715
}

formating/text.go

+42-3
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,31 @@ import (
88
"webcrawler/queue"
99
)
1010

11-
func GetLinks(fetchingURL string, body string) ([]queue.Message, error) {
11+
const (
12+
MaxDepth = 8
13+
)
14+
15+
func GetLinks(fetchingURL string, body string) ([]string, error) {
1216
var links []string
1317
doc := soup.HTMLParse(body)
1418
for _, link := range doc.FindAll("a") {
1519
link := resolveURL(fetchingURL, link.Attrs()["href"])
1620
links = append(links, link)
1721
}
1822

23+
links = removeLargeWebSites(links)
1924
links = removeAnchors(links)
2025
links = removeDuplicates(links)
2126
links = removeMailTo(links)
22-
convertedLinks := convertLinksToQueueMessage(links)
27+
links = removeDepthLinks(links)
28+
29+
return links, nil
30+
}
2331

24-
return convertedLinks, nil
32+
func ResolveLinkToQueueMessage(links []string) []queue.Message {
33+
return convertLinksToQueueMessage(links)
2534
}
35+
2636
func convertLinksToQueueMessage(links []string) []queue.Message {
2737
messages := []queue.Message{}
2838
for _, link := range links {
@@ -57,6 +67,22 @@ func removeDuplicates(links []string) []string {
5767
return result
5868
}
5969

70+
func removeDepthLinks(links []string) []string {
71+
var result []string
72+
for _, link := range links {
73+
if testDepthLink(link, MaxDepth) {
74+
result = append(result, link)
75+
}
76+
77+
}
78+
return result
79+
}
80+
81+
func testDepthLink(link string, maxDepth int) bool {
82+
res := strings.Split(link, "/")
83+
return len(res) <= maxDepth+3
84+
}
85+
6086
func removeAnchors(links []string) []string {
6187
var result []string
6288
for _, link := range links {
@@ -84,3 +110,16 @@ func removeMailTo(links []string) []string {
84110
}
85111
return result
86112
}
113+
114+
func removeLargeWebSites(links []string) []string {
115+
largeWebsites := []string{"facebook.com", "twitter.com", "instagram.com", "youtube.com", "linkedin.com", "pinterest.com", "tumblr.com", "reddit.com", "snapchat.com", "whatsapp.com", "quora.com", "flickr.com", "vimeo.com", "medium.com", "vk.com", "soundcloud.com"}
116+
117+
for _, link := range links {
118+
for _, website := range largeWebsites {
119+
if strings.Contains(link, website) {
120+
continue
121+
}
122+
}
123+
}
124+
return links
125+
}

formating/text_test.go

+22
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,28 @@ func TestLinkResolve(t *testing.T) {
4343

4444
}
4545

46+
func TestRemoveDepthLinks(t *testing.T) {
47+
tests := []struct {
48+
url string
49+
expected bool
50+
}{
51+
{"http://www.example.com/", true},
52+
{"http://www.example.com/test", true},
53+
{"http://www.example.com/test/test", true},
54+
{"http://www.example.com/test/test/test", true},
55+
{"http://www.example.com/test/test/test/test", true},
56+
{"http://www.example.com/test/test/test/test/test", true},
57+
{"http://www.example.com/test/test/test/test/test/test", false},
58+
}
59+
t.Run("Test basic", func(t *testing.T) {
60+
for _, test := range tests {
61+
result := testDepthLink(test.url, 5)
62+
require.Equalf(t, result, test.expected, "Removing depth links failed %s", test.url)
63+
}
64+
65+
})
66+
}
67+
4668
func TestRemoveDuplicates(t *testing.T) {
4769
links := []string{"http://www.example.com/", "http://www.example.com/", "http://www.example.com/test", "http://www.example.com/test"}
4870
links = removeDuplicates(links)

go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ require (
1111
github.com/aws/aws-sdk-go-v2/service/sqs v1.29.6
1212
github.com/joho/godotenv v1.5.1
1313
github.com/lib/pq v1.10.9
14+
github.com/neo4j/neo4j-go-driver/v5 v5.20.0
1415
github.com/stretchr/testify v1.6.1
1516
github.com/temoto/robotstxt v1.1.2
1617
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c

go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
4848
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
4949
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
5050
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
51+
github.com/neo4j/neo4j-go-driver/v5 v5.20.0 h1:XnoAi6g6XRkX+wxWa3yM+f7PT2VUkGQfBGtGuJL4fsM=
52+
github.com/neo4j/neo4j-go-driver/v5 v5.20.0/go.mod h1:Vff8OwT7QpLm7L2yYr85XNWe9Rbqlbeb9asNXJTHO4k=
5153
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
5254
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
5355
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=

graphx/add.go

+73
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
package graphx
2+
3+
import (
4+
"context"
5+
"github.com/neo4j/neo4j-go-driver/v5/neo4j"
6+
"webcrawler/site"
7+
)
8+
9+
func (g *Graph) AddWebsite(ctx context.Context, web site.Website) error {
10+
ses := g.Neo4j.NewSession(ctx, neo4j.SessionConfig{})
11+
_, err := ses.ExecuteWrite(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
12+
_, err := tx.Run(ctx, `
13+
MERGE (w:Website {url: $url, ProminenceValue: $prominenceValue})
14+
on create set w.prominence = 1
15+
on create set w.crawled_at = datetime()
16+
on match set w.prominence = coalesce(w.prominence, 0) + 1
17+
18+
`, map[string]interface{}{
19+
"url": web.Url,
20+
"prominenceValue": web.ProminenceValue,
21+
})
22+
23+
return nil, err
24+
})
25+
return err
26+
}
27+
28+
func (g *Graph) AddLink(ctx context.Context, page site.Page) error {
29+
ses := g.Neo4j.NewSession(ctx, neo4j.SessionConfig{})
30+
var err error
31+
for _, link := range page.Links {
32+
_, err = ses.ExecuteWrite(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
33+
_, err := tx.Run(ctx, `
34+
MERGE (page1:Page {url: $callURL})
35+
MERGE (page2:Page {url: $linkTo})
36+
MERGE (page1)-[:LINK_TO]->(page2)
37+
MERGE (page2)-[:LINK_FROM]->(page1)
38+
on create set page2.prominence = 1
39+
on create set page1.prominence = 1
40+
on match set page2.prominence = coalesce(page2.prominence, 0) + 1
41+
on create set page1.crawled_at = datetime()
42+
`, map[string]interface{}{
43+
"callURL": page.Url,
44+
"linkTo": link,
45+
})
46+
return nil, err
47+
})
48+
49+
if err != nil {
50+
return err
51+
}
52+
}
53+
return err
54+
}
55+
56+
func (g *Graph) AddPage(ctx context.Context, web site.Website, page site.Page) error {
57+
ses := g.Neo4j.NewSession(ctx, neo4j.SessionConfig{})
58+
_, err := ses.ExecuteWrite(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
59+
_, err := tx.Run(ctx, `
60+
MERGE (page:Page {url:$url, title:$title})
61+
MERGE (web:Website {baseURL:$baseURL})
62+
MERGE (web)-[:OWNED_BY]->(page)
63+
MERGE (page)-[:BELONGS_TO]->(web)
64+
`, map[string]any{
65+
"url": page.Url,
66+
"title": page.Title,
67+
"baseURL": web.Url,
68+
})
69+
70+
return nil, err
71+
})
72+
return err
73+
}

graphx/conn.go

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package graphx
2+
3+
import (
4+
"context"
5+
"github.com/neo4j/neo4j-go-driver/v5/neo4j"
6+
)
7+
8+
func Conn(ctx context.Context, user string, password string, url string) (neo4j.DriverWithContext, error) {
9+
dbUri := url
10+
dbUser := user
11+
dbPassword := password
12+
driver, err := neo4j.NewDriverWithContext(
13+
dbUri,
14+
neo4j.BasicAuth(dbUser, dbPassword, ""),
15+
)
16+
17+
if err != nil {
18+
return nil, err
19+
}
20+
err = driver.VerifyConnectivity(ctx)
21+
if err != nil {
22+
panic(err)
23+
}
24+
return driver, err
25+
}
26+
27+
func New(graph neo4j.DriverWithContext) *Graph {
28+
return &Graph{
29+
Neo4j: graph,
30+
}
31+
32+
}

graphx/type.go

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
package graphx
2+
3+
import "github.com/neo4j/neo4j-go-driver/v5/neo4j"
4+
5+
type Graph struct {
6+
Neo4j neo4j.DriverWithContext
7+
}

0 commit comments

Comments
 (0)