Commit 93dae4f

feat: add data crawling pipeline

1 parent cd2d1c9 commit 93dae4f

15 files changed: +591 -0 lines

Diff for: 1-data-crawling/README.md (+1)

TBD

Diff for: 1-data-crawling/config.py (+17)

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file="../.env", env_file_encoding="utf-8")

    MONGO_DATABASE_HOST: str = (
        "mongodb://mongo1:30001,mongo2:30002,mongo3:30003/?replicaSet=my-replica-set"
    )
    MONGO_DATABASE_NAME: str = "scrabble"

    # Optional LinkedIn credentials for scraping your profile
    LINKEDIN_USERNAME: str | None = None
    LINKEDIN_PASSWORD: str | None = None


settings = Settings()
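Since Settings is backed by pydantic-settings with env_file="../.env", every field can also be supplied as an environment variable of the same name, which takes precedence over both the .env file and the defaults above. A minimal sketch, using placeholder values that are not part of this commit:

import os

# Environment variables override the .env file and the class defaults.
os.environ["MONGO_DATABASE_HOST"] = "mongodb://localhost:27017"  # placeholder
os.environ["LINKEDIN_USERNAME"] = "user@example.com"             # placeholder
os.environ["LINKEDIN_PASSWORD"] = "change-me"                    # placeholder

from config import settings  # settings is instantiated at import time

print(settings.MONGO_DATABASE_HOST)  # mongodb://localhost:27017
print(settings.MONGO_DATABASE_NAME)  # "scrabble" (default kept)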

Diff for: 1-data-crawling/crawlers/__init__.py (+5)

from .github import GithubCrawler
from .linkedin import LinkedInCrawler
from .medium import MediumCrawler

__all__ = ["GithubCrawler", "LinkedInCrawler", "MediumCrawler"]
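The package only re-exports the three concrete crawlers. A hypothetical helper that picks one by link domain (not part of this commit; the function name is an assumption) could look like:

from urllib.parse import urlparse

from crawlers import GithubCrawler, LinkedInCrawler, MediumCrawler


def choose_crawler(link: str):
    # Hypothetical dispatch: map the URL's host to one of the crawlers above.
    host = urlparse(link).netloc
    if "github.com" in host:
        return GithubCrawler()
    if "linkedin.com" in host:
        return LinkedInCrawler()
    if "medium.com" in host:
        return MediumCrawler()
    raise ValueError(f"No crawler registered for: {link}")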

Diff for: 1-data-crawling/crawlers/base.py (+66)

import time
from abc import ABC, abstractmethod
from tempfile import mkdtemp

from db.documents import BaseDocument
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class BaseCrawler(ABC):
    model: type[BaseDocument]

    @abstractmethod
    def extract(self, link: str, **kwargs) -> None: ...


class BaseAbstractCrawler(BaseCrawler, ABC):
    def __init__(self, scroll_limit: int = 5) -> None:
        options = webdriver.ChromeOptions()
        options.binary_location = "/opt/chrome/chrome"
        options.add_argument("--no-sandbox")
        options.add_argument("--headless=new")
        options.add_argument("--single-process")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-gpu")
        options.add_argument("--log-level=3")
        options.add_argument("--disable-popup-blocking")
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-dev-tools")
        options.add_argument("--ignore-certificate-errors")
        options.add_argument("--no-zygote")
        options.add_argument(f"--user-data-dir={mkdtemp()}")
        options.add_argument(f"--data-path={mkdtemp()}")
        options.add_argument(f"--disk-cache-dir={mkdtemp()}")
        options.add_argument("--remote-debugging-port=9222")

        self.set_extra_driver_options(options)

        self.scroll_limit = scroll_limit
        self.driver = webdriver.Chrome(
            service=webdriver.ChromeService("/opt/chromedriver"),
            options=options,
        )

    def set_extra_driver_options(self, options: Options) -> None:
        pass

    def login(self) -> None:
        pass

    def scroll_page(self) -> None:
        """Scroll through the page based on the scroll limit."""
        current_scroll = 0
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            time.sleep(5)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height or (
                self.scroll_limit and current_scroll >= self.scroll_limit
            ):
                break
            last_height = new_height
            current_scroll += 1
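Concrete Selenium-based crawlers subclass BaseAbstractCrawler and override set_extra_driver_options, login, and extract as needed. A minimal sketch of such a subclass, reusing ArticleDocument from this commit (the ExampleCrawler class itself is hypothetical):

from crawlers.base import BaseAbstractCrawler
from db.documents import ArticleDocument


class ExampleCrawler(BaseAbstractCrawler):
    # Hypothetical subclass: not part of this commit.
    model = ArticleDocument

    def set_extra_driver_options(self, options) -> None:
        # Extra Chrome flags specific to this crawler.
        options.add_argument("--window-size=1920,1080")

    def extract(self, link: str, **kwargs) -> None:
        self.driver.get(link)
        self.scroll_page()  # honours scroll_limit from BaseAbstractCrawler
        html = self.driver.page_source
        self.driver.close()
        self.model(
            platform="example",
            content={"html": html},
            link=link,
            author_id=kwargs.get("user"),
        ).save()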

Diff for: 1-data-crawling/crawlers/github.py (+57)

import os
import shutil
import subprocess
import tempfile

from aws_lambda_powertools import Logger

from crawlers.base import BaseCrawler
from db.documents import RepositoryDocument

logger = Logger(service="llm-twin-course/crawler")


class GithubCrawler(BaseCrawler):
    model = RepositoryDocument

    def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None:
        super().__init__()
        self._ignore = ignore

    def extract(self, link: str, **kwargs) -> None:
        logger.info(f"Starting scraping GitHub repository: {link}")

        repo_name = link.rstrip("/").split("/")[-1]

        local_temp = tempfile.mkdtemp()

        try:
            os.chdir(local_temp)
            subprocess.run(["git", "clone", link])

            repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])

            tree = {}
            for root, dirs, files in os.walk(repo_path):
                dir = root.replace(repo_path, "").lstrip("/")
                if dir.startswith(self._ignore):
                    continue

                for file in files:
                    if file.endswith(self._ignore):
                        continue
                    file_path = os.path.join(dir, file)
                    with open(os.path.join(root, file), "r", errors="ignore") as f:
                        tree[file_path] = f.read().replace(" ", "")

            instance = self.model(
                name=repo_name, link=link, content=tree, owner_id=kwargs.get("user")
            )
            instance.save()

        except Exception:
            raise
        finally:
            shutil.rmtree(local_temp)

        logger.info(f"Finished scraping GitHub repository: {link}")

Diff for: 1-data-crawling/crawlers/linkedin.py (+156)

import time
from typing import Dict, List

from aws_lambda_powertools import Logger
from bs4 import BeautifulSoup
from bs4.element import Tag
from errors import ImproperlyConfigured
from selenium.webdriver.common.by import By

from db.documents import PostDocument
from crawlers.base import BaseAbstractCrawler
from config import settings

logger = Logger(service="decodingml/crawler")


class LinkedInCrawler(BaseAbstractCrawler):
    model = PostDocument

    def set_extra_driver_options(self, options) -> None:
        options.add_experimental_option("detach", True)

    def extract(self, link: str, **kwargs):
        logger.info(f"Starting scraping data for profile: {link}")

        self.login()

        soup = self._get_page_content(link)

        data = {
            "Name": self._scrape_section(soup, "h1", class_="text-heading-xlarge"),
            "About": self._scrape_section(soup, "div", class_="display-flex ph5 pv3"),
            "Main Page": self._scrape_section(soup, "div", {"id": "main-content"}),
            "Experience": self._scrape_experience(link),
            "Education": self._scrape_education(link),
        }

        self.driver.get(link)
        time.sleep(5)
        button = self.driver.find_element(
            By.CSS_SELECTOR,
            ".app-aware-link.profile-creator-shared-content-view__footer-action",
        )
        button.click()

        # Scrolling and scraping posts
        self.scroll_page()
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        post_elements = soup.find_all(
            "div",
            class_="update-components-text relative update-components-update-v2__commentary",
        )
        buttons = soup.find_all("button", class_="update-components-image__image-link")
        post_images = self._extract_image_urls(buttons)

        posts = self._extract_posts(post_elements, post_images)
        logger.info(f"Found {len(posts)} posts for profile: {link}")

        self.driver.close()

        self.model.bulk_insert(
            [
                PostDocument(
                    platform="linkedin", content=post, author_id=kwargs.get("user")
                )
                for post in posts
            ]
        )

        logger.info(f"Finished scraping data for profile: {link}")

    def _scrape_section(self, soup: BeautifulSoup, *args, **kwargs) -> str:
        """Scrape a specific section of the LinkedIn profile."""
        # Example: Scrape the 'About' section
        parent_div = soup.find(*args, **kwargs)
        return parent_div.get_text(strip=True) if parent_div else ""

    def _extract_image_urls(self, buttons: List[Tag]) -> Dict[str, str]:
        """
        Extracts image URLs from button elements.

        Args:
            buttons (List[Tag]): A list of BeautifulSoup Tag objects representing buttons.

        Returns:
            Dict[str, str]: A dictionary mapping post indexes to image URLs.
        """
        post_images = {}
        for i, button in enumerate(buttons):
            img_tag = button.find("img")
            if img_tag and "src" in img_tag.attrs:
                post_images[f"Post_{i}"] = img_tag["src"]
            else:
                logger.warning("No image found in this button")
        return post_images

    def _get_page_content(self, url: str) -> BeautifulSoup:
        """Retrieve the page content of a given URL."""
        self.driver.get(url)
        time.sleep(5)
        return BeautifulSoup(self.driver.page_source, "html.parser")

    def _extract_posts(
        self, post_elements: List[Tag], post_images: Dict[str, str]
    ) -> Dict[str, Dict[str, str]]:
        """
        Extracts post texts and combines them with their respective images.

        Args:
            post_elements (List[Tag]): A list of BeautifulSoup Tag objects representing post elements.
            post_images (Dict[str, str]): A dictionary containing image URLs mapped by post index.

        Returns:
            Dict[str, Dict[str, str]]: A dictionary containing post data with text and optional image URL.
        """
        posts_data = {}
        for i, post_element in enumerate(post_elements):
            post_text = post_element.get_text(strip=True, separator="\n")
            post_data = {"text": post_text}
            if f"Post_{i}" in post_images:
                post_data["image"] = post_images[f"Post_{i}"]
            posts_data[f"Post_{i}"] = post_data
        return posts_data

    def _scrape_experience(self, profile_url: str) -> str:
        """Scrapes the Experience section of the LinkedIn profile."""
        self.driver.get(profile_url + "/details/experience/")
        time.sleep(5)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        experience_content = soup.find("section", {"id": "experience-section"})
        return experience_content.get_text(strip=True) if experience_content else ""

    def _scrape_education(self, profile_url: str) -> str:
        """Scrapes the Education section of the LinkedIn profile."""
        self.driver.get(profile_url + "/details/education/")
        time.sleep(5)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        education_content = soup.find("section", {"id": "education-section"})
        return education_content.get_text(strip=True) if education_content else ""

    def login(self):
        """Log in to LinkedIn."""
        self.driver.get("https://www.linkedin.com/login")
        if not settings.LINKEDIN_USERNAME or not settings.LINKEDIN_PASSWORD:
            raise ImproperlyConfigured(
                "LinkedIn scraper requires a valid account to perform extraction"
            )

        self.driver.find_element(By.ID, "username").send_keys(
            settings.LINKEDIN_USERNAME
        )
        self.driver.find_element(By.ID, "password").send_keys(
            settings.LINKEDIN_PASSWORD
        )
        self.driver.find_element(
            By.CSS_SELECTOR, ".login__form_action_container button"
        ).click()
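A usage sketch for the LinkedIn crawler; it requires LINKEDIN_USERNAME and LINKEDIN_PASSWORD to be resolvable through config.Settings, otherwise login() raises ImproperlyConfigured. The profile URL and user id below are placeholders:

from crawlers import LinkedInCrawler

crawler = LinkedInCrawler(scroll_limit=3)
# Logs in, scrapes the profile sections and recent posts, then bulk-inserts
# one PostDocument per post.
crawler.extract("https://www.linkedin.com/in/example-profile", user="example-user-id")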

Diff for: 1-data-crawling/crawlers/medium.py (+43)

from aws_lambda_powertools import Logger
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

from crawlers.base import BaseAbstractCrawler
from db.documents import ArticleDocument

logger = Logger(service="decodingml/crawler")


class MediumCrawler(BaseAbstractCrawler):
    model = ArticleDocument

    def set_extra_driver_options(self, options) -> None:
        options.add_argument(r"--profile-directory=Profile 2")

    def extract(self, link: str, **kwargs) -> None:
        logger.info(f"Starting scraping Medium article: {link}")

        self.driver.get(link)
        self.scroll_page()

        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        title = soup.find_all("h1", class_="pw-post-title")
        subtitle = soup.find_all("h2", class_="pw-subtitle-paragraph")

        data = {
            "Title": title[0].string if title else None,
            "Subtitle": subtitle[0].string if subtitle else None,
            "Content": soup.get_text(),
        }

        self.driver.close()
        instance = self.model(
            platform="medium", content=data, link=link, author_id=kwargs.get("user")
        )
        instance.save()

        logger.info(f"Successfully scraped and saved article: {link}")

    def login(self):
        """Log in to Medium with Google."""
        self.driver.get("https://medium.com/m/signin")
        self.driver.find_element(By.TAG_NAME, "a").click()
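Note that extract() here never calls login(), so public articles are scraped without signing in. A usage sketch (the article URL and user id are placeholders):

from crawlers import MediumCrawler

crawler = MediumCrawler()
# crawler.login()  # optional Google sign-in; not invoked by extract() itself
crawler.extract(
    "https://medium.com/@example/example-article", user="example-user-id"
)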

Diff for: 1-data-crawling/db/__init__.py

Whitespace-only changes.
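The db.documents module imported by every crawler is not shown in this excerpt. A hypothetical sketch of the interface the crawlers rely on, inferred only from how the models are called above (the base class, field names, and types are assumptions, not the committed implementation):

from typing import Optional

from pydantic import BaseModel


class BaseDocument(BaseModel):
    def save(self) -> None:
        # Persist this document to MongoDB; real implementation not shown here.
        raise NotImplementedError

    @classmethod
    def bulk_insert(cls, documents: list) -> None:
        # Insert many documents in one call; real implementation not shown here.
        raise NotImplementedError


class RepositoryDocument(BaseDocument):
    name: str
    link: str
    content: dict
    owner_id: Optional[str] = None


class PostDocument(BaseDocument):
    platform: str
    content: dict
    author_id: Optional[str] = None


class ArticleDocument(BaseDocument):
    platform: str
    content: dict
    link: str
    author_id: Optional[str] = None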
