
Commit 055ca29

Merge pull request #4 from Acollie/feat/ddb-backing
Feat Adding DDB backing
2 parents 45956ca + 21901ee commit 055ca29

File tree: 6 files changed, +2727 -15 lines changed

dynamoDBx/add.go

+2-2
@@ -8,8 +8,8 @@ import (
 	"webcrawler/site"
 )
 
-func (db *DB) AddPage(ctx context.Context, website site.Page) error {
-	av, err := attributevalue.MarshalMap(website)
+func (db *DB) AddPage(ctx context.Context, page site.Page) error {
+	av, err := attributevalue.MarshalMap(page)
 	if err != nil {
 		return err
 	}
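
Only the top of AddPage appears in this hunk. A minimal sketch of how the marshalled map would typically be written to DynamoDB with the AWS SDK for Go v2; the DB fields (client, pageTable) and the table name are assumptions for illustration, not code from this PR:

```go
package dynamoDBx

import (
	"context"

	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/feature/dynamodb/attributevalue"
	"github.com/aws/aws-sdk-go-v2/service/dynamodb"

	"webcrawler/site"
)

// DB is assumed to wrap a DynamoDB client and a table name; the real struct
// is not shown in this diff.
type DB struct {
	client    *dynamodb.Client
	pageTable string
}

func (db *DB) AddPage(ctx context.Context, page site.Page) error {
	// Marshal the Page into a DynamoDB attribute map (as in the hunk above).
	av, err := attributevalue.MarshalMap(page)
	if err != nil {
		return err
	}
	// Write the marshalled map as a single item; table name is hypothetical.
	_, err = db.client.PutItem(ctx, &dynamodb.PutItemInput{
		TableName: aws.String(db.pageTable),
		Item:      av,
	})
	return err
}
```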

handler/flow.go

+12-2
@@ -38,13 +38,19 @@ func (h *Server) Scan(ctx context.Context) {
 	}
 
 	page, resp, err := site.NewPage(link.Url)
+	if err != nil {
+		log.Printf("fetching page %v", err)
+		h.Queue.Remove(ctx, *link.Handler)
+		return
+	}
 
 	links, err := formating.GetLinks(link.Url, resp)
 	if err != nil {
 		h.Queue.Remove(ctx, *link.Handler)
 		return
 	}
 	queueMessage := formating.ResolveLinkToQueueMessage(links)
+	page.Links = links
 
 	website := site.NewWebsite(link.Url, queueMessage)
 
@@ -55,13 +61,17 @@ func (h *Server) Scan(ctx context.Context) {
 		return
 	}
 
-	page.Links = links
-
 	if err := h.Queue.Remove(ctx, *link.Handler); err != nil {
 		log.Printf("removing link from queue %v", err)
 		return
 	}
 
+	err = h.Db.AddPage(ctx, page)
+	if err != nil {
+		log.Printf("Adding page to db %s", err)
+		return
+	}
+
 	err = h.Graph.AddLink(ctx, page)
 	if err != nil {
 		log.Printf("Adding links to graph %s", err)

site/example.html

+2,475
Large diffs are not rendered by default.

site/format.go

+85-6
@@ -6,10 +6,16 @@ import (
 	"io/ioutil"
 	"net/http"
 	"net/url"
+	"regexp"
+	"strings"
 	"time"
 	"webcrawler/queue"
 )
 
+const (
+	MaxBodyLength = 10000
+)
+
 func NewPage(fetchURL string) (Page, string, error) {
 	client := &http.Client{
 		Timeout: time.Second * 10, // Set timeout to 10 seconds
@@ -36,22 +42,38 @@ func NewPage(fetchURL string) (Page, string, error) {
 	titleTag := doc.Find("title")
 	title := ""
 	if titleTag.Error == nil {
-		// printing the title text
 		title = titleTag.Text()
 	} else {
 		title = "title not found"
 	}
-	text := doc.FullText()
 
 	baseUrl, _ := url.Parse(fetchURL)
 	return Page{
-		Url:     fetchURL,
-		Title:   title,
-		Body:    text,
-		BaseURL: baseUrl.Hostname(),
+		Url:         fetchURL,
+		Title:       title,
+		Body:        fetchText(&doc),
+		BaseURL:     baseUrl.Hostname(),
+		CrawledDate: uint64(time.Now().Unix()),
+		Meta:        fetchMeta(&doc),
 	}, string(body), nil
 }
 
+func fetchMeta(root *soup.Root) map[string]string {
+	metaInformation := make(map[string]string)
+	meta := root.FindAll("meta")
+	for _, value := range meta {
+		name := value.Attrs()["name"]
+		content := value.Attrs()["content"]
+		if name != "" && content != "" {
+			metaInformation[name] = content
+		}
+	}
+	if len(metaInformation) == 0 {
+		return map[string]string{}
+	}
+	return metaInformation
+}
+
 func NewWebsite(urlFull string, links []queue.Message) Website {
 	hostName, _ := url.Parse(urlFull)
 	return Website{
@@ -61,6 +83,63 @@ func NewWebsite(urlFull string, links []queue.Message) Website {
 	}
 }
 
+// This function gets all of the text from a soup document
+func fetchText(root *soup.Root) string {
+	res := ""
+	for _, node := range root.FindAll("p") {
+		res += node.FullText()
+	}
+	for _, node := range root.FindAll("h1") {
+		res += node.FullText()
+	}
+	for _, node := range root.FindAll("h2") {
+		res += node.FullText()
+	}
+	for _, node := range root.FindAll("div") {
+		res += node.FullText()
+	}
+	for _, node := range root.FindAll("span") {
+		res += node.FullText()
+	}
+
+	res = strings.ReplaceAll(res, "\n", "")
+	res = strings.ReplaceAll(res, "\t", "")
+	res = strings.ReplaceAll(res, "\r", "")
+	res = strings.ReplaceAll(res, "  ", " ")
+	res = strings.ReplaceAll(res, "\u00A0", " ") // Non-breaking space
+	res = strings.ReplaceAll(res, "\f", "")      // Form feed
+	res = strings.ReplaceAll(res, "\v", "")      // Vertical tab
+	res = strings.ReplaceAll(res, "\u200B", "")  // Zero-width space
+	res = strings.TrimSpace(res)
+
+	if len(res) < MaxBodyLength {
+		return res
+	}
+	return res[:MaxBodyLength]
+}
+
+func stripHTML(text *string) {
+	re := regexp.MustCompile("<[^>]*>")
+	*text = re.ReplaceAllString(*text, "")
+}
+
+func scriptEmptyLines(text *string) {
+	re := regexp.MustCompile(`(?m)^\s*$[\r\n]*|<[^>]*>`)
+	*text = re.ReplaceAllString(*text, "")
+}
+
+func stripJavascript(text *string) {
+	re := regexp.MustCompile("(?i)<script[^>]*>(.*?)</script>")
+	*text = re.ReplaceAllString(*text, "")
+}
+
+func stripCSS(text *string) {
+	re := regexp.MustCompile("(?i)<style[^>]*>(.*?)</style>")
+	*text = re.ReplaceAllString(*text, "")
+}
+
 func resolveMessageIntoLinks(messages []queue.Message) []string {
 	var links []string
 	for _, message := range messages {
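
The strip* helpers added above are exercised by the new tests but are not yet called from NewPage in this commit. A minimal sketch of how they could be chained to clean a raw HTML body; cleanBody is a hypothetical helper, not part of the PR:

```go
package site

// cleanBody shows one possible ordering of the new strip* helpers; only the
// strip* functions and MaxBodyLength exist in this commit, cleanBody is an
// illustration.
func cleanBody(raw string) string {
	body := raw
	stripJavascript(&body)  // drop <script>...</script> blocks first
	stripCSS(&body)         // then <style>...</style> blocks
	stripHTML(&body)        // then any remaining tags
	scriptEmptyLines(&body) // finally remove empty lines and leftovers
	if len(body) > MaxBodyLength {
		body = body[:MaxBodyLength] // cap the stored body, as fetchText does
	}
	return body
}
```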

site/format_test.go

+146
@@ -0,0 +1,146 @@
+package site
+
+import (
+	"github.com/anaskhan96/soup"
+	"github.com/stretchr/testify/require"
+	"os"
+	"testing"
+)
+
+func loadExampleHTML(fileName string) string {
+	data, err := os.ReadFile(fileName)
+	if err != nil {
+		panic(err)
+	}
+	return string(data)
+}
+
+func TestFullPage(t *testing.T) {
+	html := loadExampleHTML("example.html")
+	root := soup.HTMLParse(html)
+	res := fetchText(&root)
+	require.LessOrEqual(t, len(res), MaxBodyLength, "Expected empty string, got %s", res)
+}
+
+func TestFetchMeta(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected map[string]string
+	}{
+		{
+			name:  "Test basic",
+			input: `<meta name="description" content="This is a test description">`,
+			expected: map[string]string{
+				"description": "This is a test description",
+			},
+		},
+		{
+			name:  "Test with multiple meta",
+			input: `<meta name="description" content="This is a test description"><meta name="author" content="Alex Collie">`,
+			expected: map[string]string{
+				"description": "This is a test description",
+				"author":      "Alex Collie",
+			},
+		},
+		{
+			name:     "Test with no meta",
+			input:    `<title>This is a test</title>`,
+			expected: map[string]string{},
+		},
+		{
+			name:     "Test with no content",
+			input:    `<meta name="description">`,
+			expected: map[string]string{},
+		},
+		{
+			name:     "Test with no name",
+			input:    `<meta content="This is a test description">`,
+			expected: map[string]string{},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			root := soup.HTMLParse(test.input)
+			res := fetchMeta(&root)
+			require.Equal(t, test.expected, res)
+		})
+	}
+}
+
+func TestStripJavascript(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "Test basic",
+			input:    "This is a test <script>console.log('test')</script>",
+			expected: "This is a test ",
+		},
+		{
+			name:     "Test with multiple scripts",
+			input:    "This is a test <script>console.log('test')</script> <script>console.log('test')</script>",
+			expected: "This is a test ",
+		},
+		{
+			name:     "Test with no script",
+			input:    "This is a test",
+			expected: "This is a test",
+		},
+		{
+			name:     "Complex ref test",
+			input:    `<!DOCTYPE html><html lang="en"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><title>Alex Collie&#x27;s personal site</title><meta name="description" content="Alex Collie&#x27;s personal website, a site for a software developer based in London"/><meta name="next-head-count" content="4"/><link rel="preload" href="/_next/static/css/090c144453b7cc79.css" as="style"/><link rel="stylesheet" href="/_next/static/css/090c144453b7cc79.css" data-n-g=""/><link rel="preload" href="/_next/static/css/ae4ed9c503fd1e33.css" as="style"/><link rel="stylesheet" href="/_next/static/css/ae4ed9c503fd1e33.css" data-n-p=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/_next/static/chunks/webpack-b8f8d6679aaa5f42.js" defer=""></script><script src="/_next/static/chunks/framework-66d32731bdd20e83.js" defer=""></script><script src="/_next/static/chunks/main-d190bcef2284c937.js" defer=""></script>`,
+			expected: `<!DOCTYPE html><html lang="en"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><title>Alex Collie&#x27;s personal site</title><meta name="description" content="Alex Collie&#x27;s personal website, a site for a software developer based in London"/><meta name="next-head-count" content="4"/><link rel="preload" href="/_next/static/css/090c144453b7cc79.css" as="style"/><link rel="stylesheet" href="/_next/static/css/090c144453b7cc79.css" data-n-g=""/><link rel="preload" href="/_next/static/css/ae4ed9c503fd1e33.css" as="style"/><link rel="stylesheet" href="/_next/static/css/ae4ed9c503fd1e33.css" data-n-p=""/><noscript data-n-css=""></noscript>`,
+		},
+		{
+			name:     "Complex ref test 2",
+			input:    `<script>window.env={"AD_SLOT_CLIENT_INJECTOR_REGISTRY":"https://test.com"};</script>`,
+			expected: ``,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			result := test.input
+			stripJavascript(&result)
+			require.Equal(t, test.expected, result)
+		})
+	}
+}
+
+func TestRemoveCSS(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "Test basic",
+			input:    "This is a test <style>body{color: red}</style>",
+			expected: "This is a test ",
+		},
+		{
+			name:     "Test with multiple styles",
+			input:    "This is a test <style>body{color: red}</style> <style>body{color: red}</style>",
+			expected: "This is a test ",
+		},
+		{
+			name:     "Test with no style",
+			input:    "This is a test",
+			expected: "This is a test",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			result := test.input
+			stripCSS(&result)
+			require.Equal(t, test.expected, result)
+		})
+	}
+}

site/types.go

+7-5
@@ -9,11 +9,13 @@ type pageI interface {
 	Save(website Page) error
 }
 type Page struct {
-	Url     string `dynamodbav:"PageURL"`
-	Title   string `dynamodbav:"title"`
-	Body    string `dynamodbav:"body"`
-	BaseURL string `dynamodbav:"BaseURL"`
-	Links   []string
+	Url         string            `dynamodbav:"PageURL"`
+	Title       string            `dynamodbav:"title"`
+	Body        string            `dynamodbav:"body"`
+	BaseURL     string            `dynamodbav:"BaseURL"`
+	Meta        map[string]string `dynamodbav:"meta"`
+	CrawledDate uint64            `dynamodbav:"crawledDate"`
+	Links       []string          `dynamodbav:"-"`
 }
 
 type Website struct {
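
With the new tags, Meta and CrawledDate are stored under the attribute names meta and crawledDate, while `dynamodbav:"-"` keeps Links out of the marshalled item. A small illustration using attributevalue.MarshalMap; all values are made up:

```go
package main

import (
	"fmt"

	"github.com/aws/aws-sdk-go-v2/feature/dynamodb/attributevalue"

	"webcrawler/site"
)

func main() {
	page := site.Page{
		Url:         "https://example.com", // example values, not from the PR
		Title:       "Example",
		Body:        "Example body",
		BaseURL:     "example.com",
		Meta:        map[string]string{"description": "demo"},
		CrawledDate: 1700000000,
		Links:       []string{"https://example.com/about"},
	}
	av, err := attributevalue.MarshalMap(page)
	if err != nil {
		panic(err)
	}
	// Printed keys follow the dynamodbav tags: PageURL, title, body, BaseURL,
	// meta, crawledDate. Links is omitted because of the `dynamodbav:"-"` tag.
	for key := range av {
		fmt.Println(key)
	}
}
```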
