Skip to content

Commit 9a72090

Browse files
authored
Merge pull request #2 from Acollie/feat/relational-db
Feat/relational db
2 parents 565da1a + a4872ab commit 9a72090

24 files changed

+341
-49
lines changed

awsx/const.go

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
package awsx
2+
3+
// Region is the AWS region identifier shared by the project's AWS clients,
// so the value is defined once instead of being repeated as a literal.
const Region = "eu-west-1"

ignore_list.yml config.yml

+3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1+
relational: true
12
websites:
23
- "https://www.example.com"
34
- "https://www.github.com"
5+
- "https://docs.github.com"
6+
- "https://partner.github.com"
47
- "https://www.google.com"
58
- "https://www.facebook.com"
69
- "https://www.youtube.com"

ignore_list/fetch.go config/fetch.go

+8-5
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package ignore_list
1+
package config
22

33
import (
44
"gopkg.in/yaml.v3"
@@ -8,14 +8,16 @@ import (
88
)
99

1010
type IgnoreList struct {
11-
Website map[string]bool
11+
Website map[string]bool
12+
Relational bool
1213
}
1314
type rawIgnoreList struct {
14-
Websites []string `yaml:"websites"`
15+
Websites []string `yaml:"websites"`
16+
Relational bool `yaml:"relational"`
1517
}
1618

1719
func Fetch() *IgnoreList {
18-
yamlFile, err := ioutil.ReadFile("ignore_list.yml")
20+
yamlFile, err := ioutil.ReadFile("config.yml")
1921
if err != nil {
2022
log.Printf("yamlFile.Get err #%v ", err)
2123
}
@@ -26,7 +28,8 @@ func Fetch() *IgnoreList {
2628
}
2729

2830
conf := &IgnoreList{
29-
Website: make(map[string]bool),
31+
Website: make(map[string]bool),
32+
Relational: raw.Relational,
3033
}
3134

3235
for _, website := range raw.Websites {

ignore_list/fetch_test.go config/fetch_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package ignore_list
1+
package config
22

33
import (
44
"github.com/stretchr/testify/require"
@@ -7,7 +7,7 @@ import (
77

88
func TestIgnoreList(t *testing.T) {
99

10-
t.Run("Fetch ignore list ", func(t *testing.T) {
10+
t.Run("FetchDDB ignore list ", func(t *testing.T) {
1111
conf := IgnoreList{
1212
Website: map[string]bool{
1313
"github.com": true,

docker-compose.yml

+10-1
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,13 @@ services:
2121
- DATA_DIR=/tmp/localstack-data
2222
volumes:
2323
- "./localstack-data:/tmp/localstack-data"
24-
- "/var/run/docker.sock:/var/run/docker.sock"
24+
- "/var/run/docker.sock:/var/run/docker.sock"
25+
postgres:
26+
container_name: webcrawler-postgres
27+
image: postgres
28+
ports:
29+
- "5431:5432"
30+
environment:
31+
POSTGRES_USER: root
32+
POSTGRES_PASSWORD: root
33+
POSTGRES_DB: webcrawler

dynamo_db_x/add.go dynamoDBx/add.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package dynamo_db_x
1+
package dynamoDBx
22

33
import (
44
"context"

dynamo_db_x/fetch.go dynamoDBx/fetch.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package dynamo_db_x
1+
package dynamoDBx
22

33
import (
44
"context"

dynamo_db_x/remove.go dynamoDBx/remove.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package dynamo_db_x
1+
package dynamoDBx
22

33
import (
44
"context"

dynamo_db_x/type.go dynamoDBx/type.go

+4-3
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
package dynamo_db_x
1+
package dynamoDBx
22

33
import (
44
"github.com/aws/aws-sdk-go-v2/aws"
55
"github.com/aws/aws-sdk-go-v2/service/dynamodb"
66
"os"
7+
"webcrawler/awsx"
78
)
89

910
type DB struct {
@@ -13,15 +14,15 @@ type DB struct {
1314
}
1415

1516
func New(pageName string, websiteName string, cfg aws.Config) *DB {
16-
cfg.Region = "us-west-2"
17+
cfg.Region = awsx.Region
1718
sessionClient := dynamodb.NewFromConfig(cfg)
1819

1920
if os.Getenv("ENVIRONMENT") == "local" {
2021
cfg.EndpointResolver = aws.EndpointResolverFunc(func(service, region string) (aws.Endpoint, error) {
2122
return aws.Endpoint{
2223
PartitionID: "aws",
2324
URL: "http://localhost:8000",
24-
SigningRegion: "us-west-2",
25+
SigningRegion: awsx.Region,
2526
}, nil
2627
})
2728
sessionClient = dynamodb.NewFromConfig(cfg)

dynamo_db_x/update.go dynamoDBx/update.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package dynamo_db_x
1+
package dynamoDBx
22

33
import (
44
"context"

go.mod

+2-1
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@ require (
1010
github.com/aws/aws-sdk-go-v2/service/dynamodb v1.27.1
1111
github.com/aws/aws-sdk-go-v2/service/sqs v1.29.6
1212
github.com/joho/godotenv v1.5.1
13+
github.com/lib/pq v1.10.9
1314
github.com/stretchr/testify v1.6.1
1415
github.com/temoto/robotstxt v1.1.2
16+
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c
1517
)
1618

1719
require (
@@ -33,5 +35,4 @@ require (
3335
github.com/pmezard/go-difflib v1.0.0 // indirect
3436
golang.org/x/net v0.17.0 // indirect
3537
golang.org/x/text v0.13.0 // indirect
36-
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect
3738
)

go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGw
4646
github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
4747
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
4848
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
49+
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
50+
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
4951
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
5052
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
5153
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=

handler/flow.go

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ func (h *Server) Scan(ctx context.Context) {
2828
defer wg.Done()
2929
if h.Config.Ignore(link.Url) {
3030
log.Printf("skipping domain")
31+
h.Queue.Remove(ctx, *link.Handler)
3132
return
3233
}
3334

handler/flow_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import (
66
"os"
77
"testing"
88
"webcrawler/awsx"
9-
"webcrawler/dynamo_db_x"
9+
"webcrawler/dynamoDBx"
1010

1111
"webcrawler/queue"
1212
)
@@ -19,7 +19,7 @@ func setup(t *testing.T, ctx context.Context) Server {
1919
os.Getenv("LINKS_QUEUE"),
2020
cfg,
2121
)
22-
dbClient := dynamo_db_x.New(
22+
dbClient := dynamoDBx.New(
2323
os.Getenv("DB_TABLE_PAGE"),
2424
os.Getenv("DB_TABLE_WEBSITE"),
2525
cfg,

handler/type.go

+6-3
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
package handler
22

33
import (
4-
"webcrawler/ignore_list"
4+
"webcrawler/config"
55
"webcrawler/queue"
66
"webcrawler/site"
7+
"webcrawler/sqlRelational"
78
)
89

910
type Server struct {
1011
Queue queue.HandlerI
1112
Db DBi
12-
Config *ignore_list.IgnoreList
13+
Config *config.IgnoreList
14+
DB *sqlRelational.SqlDB
1315
}
1416

1517
type DBi interface {
@@ -22,10 +24,11 @@ type DBi interface {
2224
UpdateWebsite(page site.Page, website site.Website) error
2325
}
2426

25-
func New(db DBi, queue *queue.Handler, conf *ignore_list.IgnoreList) Server {
27+
func New(db DBi, queue *queue.Handler, sqlDB *sqlRelational.SqlDB, conf *config.IgnoreList) Server {
2628
return Server{
2729
Db: db,
2830
Queue: queue,
31+
DB: sqlDB,
2932
Config: conf,
3033
}
3134

main.go

+19-8
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,24 @@ package main
22

33
import (
44
"context"
5+
"fmt"
56
"github.com/joho/godotenv"
67
"log"
78
"os"
89
"webcrawler/awsx"
9-
"webcrawler/dynamo_db_x"
10+
"webcrawler/config"
11+
"webcrawler/dynamoDBx"
1012
"webcrawler/handler"
11-
"webcrawler/ignore_list"
1213
"webcrawler/queue"
14+
"webcrawler/sqlRelational"
1315
)
1416

1517
func main() {
16-
err := godotenv.Load()
18+
1719
ctx := context.Background()
18-
conf := ignore_list.Fetch()
20+
err := godotenv.Load()
21+
//ctx := context.Background()
22+
conf := config.Fetch()
1923
if err != nil {
2024
log.Fatalf("Failed to load .env file with error: %v", err)
2125
}
@@ -32,15 +36,22 @@ func main() {
3236
os.Getenv("LINKS_QUEUE"),
3337
cfg,
3438
)
35-
dbClient := dynamo_db_x.New(
39+
dbClient := dynamoDBx.New(
3640
os.Getenv("DB_TABLE_PAGE"),
3741
os.Getenv("DB_TABLE_WEBSITE"),
3842
cfg,
3943
)
40-
initalLink := queue.NewMessage("https://alexcollie.com")
44+
sqlClient := sqlRelational.New(
45+
os.Getenv("SQL_TABLE"),
46+
)
47+
initialLink := queue.NewMessage("https://bbc.co.uk")
4148

42-
server := handler.New(dbClient, sqsClient, conf)
43-
server.Queue.Add(ctx, initalLink)
49+
server := handler.New(dbClient, sqsClient, sqlClient, conf)
50+
err = server.Queue.Add(ctx, initialLink)
51+
if err != nil {
52+
err = fmt.Errorf("add %s", err)
53+
panic(err)
54+
}
4455
server.Scan(ctx)
4556

4657
}

main.tf

+2-4
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,8 @@ provider "aws" {
22
region = "eu-west-1"
33
}
44
resource "aws_sqs_queue" "LinksQueue" {
5-
name = "LinksQueue.fifo"
6-
message_retention_seconds = 3600
7-
fifo_queue = true
8-
content_based_deduplication = true
5+
name = "LinksQueue"
6+
message_retention_seconds = 3600
97
}
108

119

site/types.go

+7-15
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,14 @@
11
package site
22

3-
import (
4-
"github.com/anaskhan96/soup"
5-
)
6-
7-
type pageI interface {
8-
Fetch(url string) (*soup.Root, error)
9-
Save(website Page) error
10-
}
113
type Page struct {
12-
Url string `dynamodbav:"PageURL"`
13-
Title string `dynamodbav:"title"`
14-
Body string `dynamodbav:"body"`
15-
BaseURL string `dynamodbav:"BaseURL"`
4+
Url string `dynamodbav:"PageURL" sql:"url"`
5+
Title string `dynamodbav:"title" sql:"title"`
6+
Body string `dynamodbav:"body" sql:"body"`
7+
BaseURL string `dynamodbav:"BaseURL" sql:"baseUrl"`
168
}
179

1810
type Website struct {
19-
Url string `dynamodbav:"BaseURL"`
20-
Links []string `dynamodbav:"links"`
21-
ProminenceValue float64 `dynamodbav:"promanceValue"`
11+
Url string `dynamodbav:"BaseURL" sql:"baseurl"`
12+
Links []string `dynamodbav:"links" sql:"links"`
13+
ProminenceValue float64 `dynamodbav:"promanceValue" sql:"promanceValue"`
2214
}

sqlRelational/add.go

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package sqlRelational
2+
3+
import (
4+
"fmt"
5+
"webcrawler/site"
6+
)
7+
8+
func (c *SqlDB) AddWebsite(website site.Website) error {
9+
queryString := fmt.Sprintf("INSERT INTO website (baseurl, promancevalue) VALUES ('%s', %f)", website.Url, website.ProminenceValue)
10+
_, err := c.Client.Exec(queryString)
11+
12+
return err
13+
}
14+
15+
func (c *SqlDB) AddPage(page site.Page) error {
16+
queryString := fmt.Sprintf("INSERT INTO page (pageurl, title, body,baseurl) VALUES ('%s', '%s', '%s', '%s')", page.Url, page.Title, page.Body, page.BaseURL)
17+
_, err := c.Client.Exec(queryString)
18+
19+
return err
20+
}

sqlRelational/conn.go

+35
Original file line numberDiff line numberDiff line change
@@ -1 +1,36 @@
11
package sqlRelational
2+
3+
import (
4+
"database/sql"
5+
"fmt"
6+
_ "github.com/lib/pq"
7+
"os"
8+
)
9+
10+
type SqlDB struct {
11+
Client *sql.DB
12+
DBName string
13+
}
14+
15+
func connect(dbName string) *sql.DB {
16+
host := os.Getenv("POSTGRES_HOST")
17+
password := os.Getenv("POSTGRES_PASSWORD")
18+
user := os.Getenv("POSTGRES_USER")
19+
20+
psqlInfo := fmt.Sprintf("host=%s port=%d user=%s "+
21+
"password=%s dbname=%s sslmode=disable",
22+
host, port, user, password, dbName)
23+
24+
client, err := sql.Open("postgres", psqlInfo)
25+
if err != nil {
26+
panic(err)
27+
}
28+
return client
29+
}
30+
31+
func New(dbName string) *SqlDB {
32+
return &SqlDB{
33+
Client: connect(dbName),
34+
DBName: dbName,
35+
}
36+
}

0 commit comments

Comments
 (0)