Skip to content

Commit 565da1a

Browse files
authoredMar 31, 2024··
Merge pull request #1 from Acollie/feat/adding-db-tests
Feat adding tests
2 parents edd76a6 + fa7bc9c commit 565da1a

File tree

17 files changed

+145
-21
lines changed

17 files changed

+145
-21
lines changed
 

‎.github/workflows/terraform.yml

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Runs `terraform init` and `terraform validate` on pushes to main and on pull requests.
name: 'Terraform'

on:
  push:
    branches:
      - main
  pull_request:

jobs:
  terraform:
    name: 'Terraform'
    runs-on: ubuntu-latest

    steps:
      # Check out the repository so the Terraform configuration is available to later steps.
      - name: Checkout
        uses: actions/checkout@v3

      # Pin the Terraform CLI version used by the init/validate steps.
      - name: Install Terraform
        uses: hashicorp/setup-terraform@v1
        with:
          terraform_version: 1.0.5

      - name: Terraform Init
        run: terraform init

      - name: Terraform Validate
        run: terraform validate

‎dynamoDBx/add.go ‎dynamo_db_x/add.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package dynamoDBx
1+
package dynamo_db_x
22

33
import (
44
"context"

‎dynamoDBx/fetch.go ‎dynamo_db_x/fetch.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package dynamoDBx
1+
package dynamo_db_x
22

33
import (
44
"context"

‎dynamoDBx/remove.go ‎dynamo_db_x/remove.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package dynamoDBx
1+
package dynamo_db_x
22

33
import (
44
"context"

‎dynamoDBx/type.go ‎dynamo_db_x/type.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package dynamoDBx
1+
package dynamo_db_x
22

33
import (
44
"github.com/aws/aws-sdk-go-v2/aws"

‎dynamoDBx/update.go ‎dynamo_db_x/update.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package dynamoDBx
1+
package dynamo_db_x
22

33
import (
44
"context"

‎handler/flow.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,12 @@ func (h *Server) Scan(ctx context.Context) {
2525

2626
wg.Add(1)
2727
go func() {
28-
2928
defer wg.Done()
29+
if h.Config.Ignore(link.Url) {
30+
log.Printf("skipping domain")
31+
return
32+
}
33+
3034
valid, err := site.FetchRobots(link.Url)
3135
if err != nil {
3236
log.Printf("fetching robots %v", err)

‎handler/flow_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import (
66
"os"
77
"testing"
88
"webcrawler/awsx"
9-
"webcrawler/dynamoDBx"
9+
"webcrawler/dynamo_db_x"
1010

1111
"webcrawler/queue"
1212
)
@@ -19,7 +19,7 @@ func setup(t *testing.T, ctx context.Context) Server {
1919
os.Getenv("LINKS_QUEUE"),
2020
cfg,
2121
)
22-
dbClient := dynamoDBx.New(
22+
dbClient := dynamo_db_x.New(
2323
os.Getenv("DB_TABLE_PAGE"),
2424
os.Getenv("DB_TABLE_WEBSITE"),
2525
cfg,

‎handler/type.go

+8-5
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
package handler
22

33
import (
4+
"webcrawler/ignore_list"
45
"webcrawler/queue"
56
"webcrawler/site"
67
)
78

89
type Server struct {
9-
Queue *queue.Handler
10-
Db DBi
10+
Queue queue.HandlerI
11+
Db DBi
12+
Config *ignore_list.IgnoreList
1113
}
1214

1315
type DBi interface {
@@ -20,10 +22,11 @@ type DBi interface {
2022
UpdateWebsite(page site.Page, website site.Website) error
2123
}
2224

23-
func New(db DBi, queue *queue.Handler) Server {
25+
func New(db DBi, queue *queue.Handler, conf *ignore_list.IgnoreList) Server {
2426
return Server{
25-
Db: db,
26-
Queue: queue,
27+
Db: db,
28+
Queue: queue,
29+
Config: conf,
2730
}
2831

2932
}

‎ignore_list.yml

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
websites:
2+
- "https://www.example.com"
3+
- "https://www.github.com"
4+
- "https://www.google.com"
5+
- "https://www.facebook.com"
6+
- "https://www.youtube.com"
7+
- "https://www.instagram.com"
8+
- "https://www.twitter.com"
9+
- "https://www.linkedin.com"
10+
- "https://www.pinterest.com"
11+
- "https://www.tumblr.com"
12+
- "https://www.reddit.com"
13+
- "https://www.wordpress.com"
14+
- "https://www.medium.com"

‎ignore_list/fetch.go

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package ignore_list
2+
3+
import (
4+
"gopkg.in/yaml.v3"
5+
"io/ioutil"
6+
"log"
7+
"net/url"
8+
)
9+
10+
type (
	// IgnoreList is the set of hosts that Ignore reports as skipped.
	IgnoreList struct {
		// Website is keyed by URL host, as produced by url.Parse.
		Website map[string]bool
	}

	// rawIgnoreList mirrors the YAML document: a top-level "websites" list
	// of URL strings.
	rawIgnoreList struct {
		Websites []string `yaml:"websites"`
	}
)
16+
17+
func Fetch() *IgnoreList {
18+
yamlFile, err := ioutil.ReadFile("ignore_list.yml")
19+
if err != nil {
20+
log.Printf("yamlFile.Get err #%v ", err)
21+
}
22+
raw := &rawIgnoreList{}
23+
err = yaml.Unmarshal(yamlFile, raw)
24+
if err != nil {
25+
log.Fatalf("Unmarshal: %v", err)
26+
}
27+
28+
conf := &IgnoreList{
29+
Website: make(map[string]bool),
30+
}
31+
32+
for _, website := range raw.Websites {
33+
// Extract host from the url
34+
parsedUrl, err := url.Parse(website)
35+
if err != nil {
36+
log.Printf("Failed to parse URL from yaml file: %v, err: %v", website, err)
37+
continue
38+
}
39+
conf.Website[parsedUrl.Host] = true
40+
}
41+
return conf
42+
}
43+
44+
func (i *IgnoreList) Ignore(urlStr string) bool {
45+
46+
parsedURL, err := url.Parse(urlStr)
47+
if err != nil {
48+
log.Printf("Url parse error %v ", err)
49+
return false
50+
}
51+
52+
_, isIgnored := i.Website[parsedURL.Host]
53+
return isIgnored
54+
}

‎ignore_list/fetch_test.go

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package ignore_list
2+
3+
import (
4+
"github.com/stretchr/testify/require"
5+
"testing"
6+
)
7+
8+
func TestIgnoreList(t *testing.T) {
9+
10+
t.Run("Fetch ignore list ", func(t *testing.T) {
11+
conf := IgnoreList{
12+
Website: map[string]bool{
13+
"github.com": true,
14+
},
15+
}
16+
url := "https://github.com"
17+
require.Equal(t, conf.Ignore(url), true)
18+
require.Equal(t, conf.Ignore("https://alexcollie.com"), false)
19+
})
20+
}

‎main.go

+5-4
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,16 @@ import (
66
"log"
77
"os"
88
"webcrawler/awsx"
9-
"webcrawler/dynamoDBx"
9+
"webcrawler/dynamo_db_x"
1010
"webcrawler/handler"
11+
"webcrawler/ignore_list"
1112
"webcrawler/queue"
1213
)
1314

1415
func main() {
15-
// Load .env file
1616
err := godotenv.Load()
1717
ctx := context.Background()
18+
conf := ignore_list.Fetch()
1819
if err != nil {
1920
log.Fatalf("Failed to load .env file with error: %v", err)
2021
}
@@ -31,14 +32,14 @@ func main() {
3132
os.Getenv("LINKS_QUEUE"),
3233
cfg,
3334
)
34-
dbClient := dynamoDBx.New(
35+
dbClient := dynamo_db_x.New(
3536
os.Getenv("DB_TABLE_PAGE"),
3637
os.Getenv("DB_TABLE_WEBSITE"),
3738
cfg,
3839
)
3940
initalLink := queue.NewMessage("https://alexcollie.com")
4041

41-
server := handler.New(dbClient, sqsClient)
42+
server := handler.New(dbClient, sqsClient, conf)
4243
server.Queue.Add(ctx, initalLink)
4344
server.Scan(ctx)
4445

‎main.tf

+4-2
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@ provider "aws" {
22
region = "eu-west-1"
33
}
44
resource "aws_sqs_queue" "LinksQueue" {
5-
name = "LinksQueue"
6-
message_retention_seconds = 3600
5+
name = "LinksQueue.fifo"
6+
message_retention_seconds = 3600
7+
fifo_queue = true
8+
content_based_deduplication = true
79
}
810

911

‎sqlRelational/conn.go

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package sqlRelational

‎sql_relational/consts.go ‎sqlRelational/consts.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package sql_relational
1+
package sqlRelational
22

33
const (
44
addWebsite = `insert () values into websites`

‎sql_relational/conn.go

-1
This file was deleted.

0 commit comments

Comments
 (0)
Please sign in to comment.