Commit 93dae4f

feat: add data crawling pipeline

1 parent cd2d1c9 commit 93dae4f

15 files changed: +591 -0 lines

Diff for: 1-data-crawling/README.md (+1)

TBD

Diff for: 1-data-crawling/config.py (+17)

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file="../.env", env_file_encoding="utf-8")

    MONGO_DATABASE_HOST: str = (
        "mongodb://mongo1:30001,mongo2:30002,mongo3:30003/?replicaSet=my-replica-set"
    )
    MONGO_DATABASE_NAME: str = "scrabble"

    # Optional LinkedIn credentials for scraping your profile
    LINKEDIN_USERNAME: str | None = None
    LINKEDIN_PASSWORD: str | None = None


settings = Settings()
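Since Settings is backed by pydantic-settings with env_file="../.env", every field can also be supplied as an environment variable of the same name, which takes precedence over both the .env file and the defaults above. A minimal sketch, using placeholder values that are not part of this commit:

import os

# Environment variables override the .env file and the class defaults.
os.environ["MONGO_DATABASE_HOST"] = "mongodb://localhost:27017"  # placeholder
os.environ["LINKEDIN_USERNAME"] = "user@example.com"             # placeholder
os.environ["LINKEDIN_PASSWORD"] = "change-me"                    # placeholder

from config import settings  # settings is instantiated at import time

print(settings.MONGO_DATABASE_HOST)  # mongodb://localhost:27017
print(settings.MONGO_DATABASE_NAME)  # "scrabble" (default kept)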

Diff for: 1-data-crawling/crawlers/__init__.py (+5)

from .github import GithubCrawler
from .linkedin import LinkedInCrawler
from .medium import MediumCrawler

__all__ = ["GithubCrawler", "LinkedInCrawler", "MediumCrawler"]
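The package only re-exports the three concrete crawlers. A hypothetical helper that picks one by link domain (not part of this commit; the function name is an assumption) could look like:

from urllib.parse import urlparse

from crawlers import GithubCrawler, LinkedInCrawler, MediumCrawler


def choose_crawler(link: str):
    # Hypothetical dispatch: map the URL's host to one of the crawlers above.
    host = urlparse(link).netloc
    if "github.com" in host:
        return GithubCrawler()
    if "linkedin.com" in host:
        return LinkedInCrawler()
    if "medium.com" in host:
        return MediumCrawler()
    raise ValueError(f"No crawler registered for: {link}")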

Diff for: 1-data-crawling/crawlers/base.py (+66)

import time
from abc import ABC, abstractmethod
from tempfile import mkdtemp

from db.documents import BaseDocument
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class BaseCrawler(ABC):
    model: type[BaseDocument]

    @abstractmethod
    def extract(self, link: str, **kwargs) -> None: ...


class BaseAbstractCrawler(BaseCrawler, ABC):
    def __init__(self, scroll_limit: int = 5) -> None:
        options = webdriver.ChromeOptions()
        options.binary_location = "/opt/chrome/chrome"
        options.add_argument("--no-sandbox")
        options.add_argument("--headless=new")
        options.add_argument("--single-process")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-gpu")
        options.add_argument("--log-level=3")
        options.add_argument("--disable-popup-blocking")
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-dev-tools")
        options.add_argument("--ignore-certificate-errors")
        options.add_argument("--no-zygote")
        options.add_argument(f"--user-data-dir={mkdtemp()}")
        options.add_argument(f"--data-path={mkdtemp()}")
        options.add_argument(f"--disk-cache-dir={mkdtemp()}")
        options.add_argument("--remote-debugging-port=9222")

        self.set_extra_driver_options(options)

        self.scroll_limit = scroll_limit
        self.driver = webdriver.Chrome(
            service=webdriver.ChromeService("/opt/chromedriver"),
            options=options,
        )

    def set_extra_driver_options(self, options: Options) -> None:
        pass

    def login(self) -> None:
        pass

    def scroll_page(self) -> None:
        """Scroll through the page based on the scroll limit."""
        current_scroll = 0
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            time.sleep(5)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height or (
                self.scroll_limit and current_scroll >= self.scroll_limit
            ):
                break
            last_height = new_height
            current_scroll += 1
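Concrete Selenium-based crawlers subclass BaseAbstractCrawler and override set_extra_driver_options, login, and extract as needed. A minimal sketch of such a subclass, reusing ArticleDocument from this commit (the ExampleCrawler class itself is hypothetical):

from crawlers.base import BaseAbstractCrawler
from db.documents import ArticleDocument


class ExampleCrawler(BaseAbstractCrawler):
    # Hypothetical subclass: not part of this commit.
    model = ArticleDocument

    def set_extra_driver_options(self, options) -> None:
        # Extra Chrome flags specific to this crawler.
        options.add_argument("--window-size=1920,1080")

    def extract(self, link: str, **kwargs) -> None:
        self.driver.get(link)
        self.scroll_page()  # honours scroll_limit from BaseAbstractCrawler
        html = self.driver.page_source
        self.driver.close()
        self.model(
            platform="example",
            content={"html": html},
            link=link,
            author_id=kwargs.get("user"),
        ).save()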

Diff for: 1-data-crawling/crawlers/github.py (+57)

import os
import shutil
import subprocess
import tempfile

from aws_lambda_powertools import Logger

from crawlers.base import BaseCrawler
from db.documents import RepositoryDocument

logger = Logger(service="llm-twin-course/crawler")


class GithubCrawler(BaseCrawler):
    model = RepositoryDocument

    def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None:
        super().__init__()
        self._ignore = ignore

    def extract(self, link: str, **kwargs) -> None:
        logger.info(f"Starting scraping GitHub repository: {link}")

        repo_name = link.rstrip("/").split("/")[-1]

        local_temp = tempfile.mkdtemp()

        try:
            os.chdir(local_temp)
            subprocess.run(["git", "clone", link])

            repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])

            tree = {}
            for root, dirs, files in os.walk(repo_path):
                dir = root.replace(repo_path, "").lstrip("/")
                if dir.startswith(self._ignore):
                    continue

                for file in files:
                    if file.endswith(self._ignore):
                        continue
                    file_path = os.path.join(dir, file)
                    with open(os.path.join(root, file), "r", errors="ignore") as f:
                        tree[file_path] = f.read().replace(" ", "")

            instance = self.model(
                name=repo_name, link=link, content=tree, owner_id=kwargs.get("user")
            )
            instance.save()

        except Exception:
            raise
        finally:
            shutil.rmtree(local_temp)

        logger.info(f"Finished scraping GitHub repository: {link}")

Diff for: 1-data-crawling/crawlers/linkedin.py (+156)

import time
from typing import Dict, List

from aws_lambda_powertools import Logger
from bs4 import BeautifulSoup
from bs4.element import Tag
from errors import ImproperlyConfigured
from selenium.webdriver.common.by import By

from db.documents import PostDocument
from crawlers.base import BaseAbstractCrawler
from config import settings

logger = Logger(service="decodingml/crawler")


class LinkedInCrawler(BaseAbstractCrawler):
    model = PostDocument

    def set_extra_driver_options(self, options) -> None:
        options.add_experimental_option("detach", True)

    def extract(self, link: str, **kwargs):
        logger.info(f"Starting scraping data for profile: {link}")

        self.login()

        soup = self._get_page_content(link)

        data = {
            "Name": self._scrape_section(soup, "h1", class_="text-heading-xlarge"),
            "About": self._scrape_section(soup, "div", class_="display-flex ph5 pv3"),
            "Main Page": self._scrape_section(soup, "div", {"id": "main-content"}),
            "Experience": self._scrape_experience(link),
            "Education": self._scrape_education(link),
        }

        self.driver.get(link)
        time.sleep(5)
        button = self.driver.find_element(
            By.CSS_SELECTOR,
            ".app-aware-link.profile-creator-shared-content-view__footer-action",
        )
        button.click()

        # Scrolling and scraping posts
        self.scroll_page()
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        post_elements = soup.find_all(
            "div",
            class_="update-components-text relative update-components-update-v2__commentary",
        )
        buttons = soup.find_all("button", class_="update-components-image__image-link")
        post_images = self._extract_image_urls(buttons)

        posts = self._extract_posts(post_elements, post_images)
        logger.info(f"Found {len(posts)} posts for profile: {link}")

        self.driver.close()

        self.model.bulk_insert(
            [
                PostDocument(
                    platform="linkedin", content=post, author_id=kwargs.get("user")
                )
                for post in posts
            ]
        )

        logger.info(f"Finished scraping data for profile: {link}")

    def _scrape_section(self, soup: BeautifulSoup, *args, **kwargs) -> str:
        """Scrape a specific section of the LinkedIn profile."""
        # Example: Scrape the 'About' section
        parent_div = soup.find(*args, **kwargs)
        return parent_div.get_text(strip=True) if parent_div else ""

    def _extract_image_urls(self, buttons: List[Tag]) -> Dict[str, str]:
        """
        Extracts image URLs from button elements.

        Args:
            buttons (List[Tag]): A list of BeautifulSoup Tag objects representing buttons.

        Returns:
            Dict[str, str]: A dictionary mapping post indexes to image URLs.
        """
        post_images = {}
        for i, button in enumerate(buttons):
            img_tag = button.find("img")
            if img_tag and "src" in img_tag.attrs:
                post_images[f"Post_{i}"] = img_tag["src"]
            else:
                logger.warning("No image found in this button")
        return post_images

    def _get_page_content(self, url: str) -> BeautifulSoup:
        """Retrieve the page content of a given URL."""
        self.driver.get(url)
        time.sleep(5)
        return BeautifulSoup(self.driver.page_source, "html.parser")

    def _extract_posts(
        self, post_elements: List[Tag], post_images: Dict[str, str]
    ) -> Dict[str, Dict[str, str]]:
        """
        Extracts post texts and combines them with their respective images.

        Args:
            post_elements (List[Tag]): A list of BeautifulSoup Tag objects representing post elements.
            post_images (Dict[str, str]): A dictionary containing image URLs mapped by post index.

        Returns:
            Dict[str, Dict[str, str]]: A dictionary containing post data with text and optional image URL.
        """
        posts_data = {}
        for i, post_element in enumerate(post_elements):
            post_text = post_element.get_text(strip=True, separator="\n")
            post_data = {"text": post_text}
            if f"Post_{i}" in post_images:
                post_data["image"] = post_images[f"Post_{i}"]
            posts_data[f"Post_{i}"] = post_data
        return posts_data

    def _scrape_experience(self, profile_url: str) -> str:
        """Scrapes the Experience section of the LinkedIn profile."""
        self.driver.get(profile_url + "/details/experience/")
        time.sleep(5)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        experience_content = soup.find("section", {"id": "experience-section"})
        return experience_content.get_text(strip=True) if experience_content else ""

    def _scrape_education(self, profile_url: str) -> str:
        """Scrapes the Education section of the LinkedIn profile."""
        self.driver.get(profile_url + "/details/education/")
        time.sleep(5)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        education_content = soup.find("section", {"id": "education-section"})
        return education_content.get_text(strip=True) if education_content else ""

    def login(self):
        """Log in to LinkedIn."""
        self.driver.get("https://www.linkedin.com/login")
        if not settings.LINKEDIN_USERNAME or not settings.LINKEDIN_PASSWORD:
            raise ImproperlyConfigured(
                "LinkedIn scraper requires a valid account to perform extraction"
            )

        self.driver.find_element(By.ID, "username").send_keys(
            settings.LINKEDIN_USERNAME
        )
        self.driver.find_element(By.ID, "password").send_keys(
            settings.LINKEDIN_PASSWORD
        )
        self.driver.find_element(
            By.CSS_SELECTOR, ".login__form_action_container button"
        ).click()
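A usage sketch for the LinkedIn crawler; it requires LINKEDIN_USERNAME and LINKEDIN_PASSWORD to be resolvable through config.Settings, otherwise login() raises ImproperlyConfigured. The profile URL and user id below are placeholders:

from crawlers import LinkedInCrawler

crawler = LinkedInCrawler(scroll_limit=3)
# Logs in, scrapes the profile sections and recent posts, then bulk-inserts
# one PostDocument per post.
crawler.extract("https://www.linkedin.com/in/example-profile", user="example-user-id")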

Diff for: 1-data-crawling/crawlers/medium.py (+43)

from aws_lambda_powertools import Logger
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

from crawlers.base import BaseAbstractCrawler
from db.documents import ArticleDocument

logger = Logger(service="decodingml/crawler")


class MediumCrawler(BaseAbstractCrawler):
    model = ArticleDocument

    def set_extra_driver_options(self, options) -> None:
        options.add_argument(r"--profile-directory=Profile 2")

    def extract(self, link: str, **kwargs) -> None:
        logger.info(f"Starting scraping Medium article: {link}")

        self.driver.get(link)
        self.scroll_page()

        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        title = soup.find_all("h1", class_="pw-post-title")
        subtitle = soup.find_all("h2", class_="pw-subtitle-paragraph")

        data = {
            "Title": title[0].string if title else None,
            "Subtitle": subtitle[0].string if subtitle else None,
            "Content": soup.get_text(),
        }

        self.driver.close()
        instance = self.model(
            platform="medium", content=data, link=link, author_id=kwargs.get("user")
        )
        instance.save()

        logger.info(f"Successfully scraped and saved article: {link}")

    def login(self):
        """Log in to Medium with Google."""
        self.driver.get("https://medium.com/m/signin")
        self.driver.find_element(By.TAG_NAME, "a").click()
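Note that extract() here never calls login(), so public articles are scraped without signing in. A usage sketch (the article URL and user id are placeholders):

from crawlers import MediumCrawler

crawler = MediumCrawler()
# crawler.login()  # optional Google sign-in; not invoked by extract() itself
crawler.extract(
    "https://medium.com/@example/example-article", user="example-user-id"
)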

Diff for: 1-data-crawling/db/__init__.py

Whitespace-only changes.
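The db.documents module imported by every crawler is not shown in this excerpt. A hypothetical sketch of the interface the crawlers rely on, inferred only from how the models are called above (the base class, field names, and types are assumptions, not the committed implementation):

from typing import Optional

from pydantic import BaseModel


class BaseDocument(BaseModel):
    def save(self) -> None:
        # Persist this document to MongoDB; real implementation not shown here.
        raise NotImplementedError

    @classmethod
    def bulk_insert(cls, documents: list) -> None:
        # Insert many documents in one call; real implementation not shown here.
        raise NotImplementedError


class RepositoryDocument(BaseDocument):
    name: str
    link: str
    content: dict
    owner_id: Optional[str] = None


class PostDocument(BaseDocument):
    platform: str
    content: dict
    author_id: Optional[str] = None


class ArticleDocument(BaseDocument):
    platform: str
    content: dict
    link: str
    author_id: Optional[str] = None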
