Skip to content

Commit 1ba74b8

Browse files
committed
feat: developing LLM twin
0 parents  commit 1ba74b8

22 files changed

+1128
-0
lines changed

Diff for: 2-data-ingestion/cdc.py

+45
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,45 @@
1+
import json
2+
import logging
3+
4+
from bson import json_util
5+
from mq import publish_to_rabbitmq
6+
7+
from config import settings
8+
from db import MongoDatabaseConnector
9+
10+
# Configure the root logger once for the whole process:
# INFO level, timestamped single-line records.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
14+
15+
16+
def stream_process() -> None:
    """Tail the MongoDB change stream and forward new documents to RabbitMQ.

    Watches the ``scrabble`` database for ``insert`` operations.  Each
    inserted document is enriched with its collection name (``type``) and
    stringified ObjectId (``entry_id``), serialized with ``json_util`` so
    BSON-specific values survive, and published to the queue named by
    ``settings.RABBITMQ_QUEUE_NAME``.  Blocks until the change stream ends;
    any failure is logged with its traceback.
    """
    try:
        # Setup MongoDB connection (singleton client shared process-wide)
        client = MongoDatabaseConnector()
        db = client["scrabble"]
        logging.info("Connected to MongoDB.")

        # Watch insert operations across all collections in the database
        changes = db.watch([{"$match": {"operationType": {"$in": ["insert"]}}}])
        for change in changes:
            data_type = change["ns"]["coll"]
            entry_id = str(change["fullDocument"]["_id"])  # Convert ObjectId to string
            change["fullDocument"].pop("_id")
            change["fullDocument"]["type"] = data_type
            change["fullDocument"]["entry_id"] = entry_id

            # Use json_util so ObjectId/date values serialize cleanly
            data = json.dumps(change["fullDocument"], default=json_util.default)
            # Lazy %-formatting: the string is only built if INFO is enabled
            logging.info("Change detected and serialized: %s", data)

            # Send data to rabbitmq
            publish_to_rabbitmq(queue_name=settings.RABBITMQ_QUEUE_NAME, data=data)
            logging.info("Data published to RabbitMQ.")

    except Exception:
        # Top-level boundary: logging.exception records the full traceback,
        # which logging.error(f"...") silently discarded.
        logging.exception("An error occurred while processing the change stream")


if __name__ == "__main__":
    stream_process()

Diff for: 2-data-ingestion/config.py

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from pydantic_settings import BaseSettings
2+
3+
4+
class Settings(BaseSettings):
    """Configuration for the data-ingestion service.

    Defaults assume the docker-compose network; every field can be
    overridden through the environment (pydantic BaseSettings).
    """

    # MongoDB configs — replica set is required for change streams.
    MONGO_DATABASE_HOST: str = (
        "mongodb://mongo1:30001,mongo2:30002,mongo3:30003/?replicaSet=my-replica-set"
    )
    MONGO_DATABASE_NAME: str = "scrabble"

    # RabbitMQ configs
    RABBITMQ_HOST: str = "mq"  # or localhost if running outside Docker
    RABBITMQ_PORT: int = 5672
    RABBITMQ_DEFAULT_USERNAME: str = "guest"
    RABBITMQ_DEFAULT_PASSWORD: str = "guest"
    RABBITMQ_QUEUE_NAME: str = "default"


# Module-level singleton read by the rest of the package.
settings = Settings()

Diff for: 2-data-ingestion/db.py

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from pymongo import MongoClient
2+
from pymongo.errors import ConnectionFailure
3+
4+
from config import settings
5+
6+
7+
class MongoDatabaseConnector:
    """Singleton class to connect to MongoDB database.

    Every instantiation returns the same shared ``MongoClient``.

    NOTE(review): because ``__new__`` returns a ``MongoClient`` (not a
    ``MongoDatabaseConnector``), ``get_database``/``close`` are not reachable
    on the returned object — callers treat the result as a plain client.
    """

    # Shared client; None until the first instantiation succeeds.
    _instance: MongoClient | None = None

    def __new__(cls, *args, **kwargs) -> MongoClient:
        if cls._instance is None:
            try:
                cls._instance = MongoClient(settings.MONGO_DATABASE_HOST)
            except ConnectionFailure as e:
                print(f"Couldn't connect to the database: {str(e)}")
                raise
            # Announce only when the client is actually created, not on
            # every reuse of the singleton (the original printed each time).
            print(
                f"Connection to database with uri: {settings.MONGO_DATABASE_HOST} successful"
            )
        return cls._instance

    def get_database(self):
        """Return the application database configured in settings."""
        return self._instance[settings.MONGO_DATABASE_NAME]

    def close(self):
        """Close the shared client, if one was ever created."""
        if self._instance:
            self._instance.close()
            print("Connection to database has been closed.")


# Eagerly build the shared client at import time (import side effect kept
# for backward compatibility with existing importers).
connection = MongoDatabaseConnector()

Diff for: 2-data-ingestion/mq.py

+111
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import pika
2+
3+
from config import settings
4+
5+
6+
class RabbitMQConnection:
    """Singleton class to manage RabbitMQ connection.

    The first construction fixes host/port/credentials; later constructions
    return the same instance unchanged.  Usable as a context manager:
    connects on entry, closes on exit.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        # Accept arbitrary constructor arguments here — the original explicit
        # signature rejected keywords such as fail_silently with a TypeError.
        if not cls._instance:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(
        self,
        host: str = None,
        port: int = None,
        username: str = None,
        password: str = None,
        virtual_host: str = "/",
        fail_silently: bool = False,
        **kwargs,
    ):
        # __init__ runs on *every* RabbitMQConnection() call because of the
        # singleton __new__; initialize only once so a later construction
        # cannot reset self._connection and leak an open connection.
        if getattr(self, "_initialized", False):
            return
        self.host = host or settings.RABBITMQ_HOST
        self.port = port or settings.RABBITMQ_PORT
        self.username = username or settings.RABBITMQ_DEFAULT_USERNAME
        self.password = password or settings.RABBITMQ_DEFAULT_PASSWORD
        self.virtual_host = virtual_host
        self.fail_silently = fail_silently
        self._connection = None
        self._initialized = True

    def __enter__(self):
        self.connect()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def connect(self):
        """Open a blocking connection; swallow failures if fail_silently."""
        try:
            credentials = pika.PlainCredentials(self.username, self.password)
            self._connection = pika.BlockingConnection(
                pika.ConnectionParameters(
                    host=self.host,
                    port=self.port,
                    virtual_host=self.virtual_host,
                    credentials=credentials,
                )
            )
        except pika.exceptions.AMQPConnectionError as e:
            print("Failed to connect to RabbitMQ:", e)
            if not self.fail_silently:
                raise e

    def is_connected(self) -> bool:
        """True while an open connection is held."""
        return self._connection is not None and self._connection.is_open

    def get_channel(self):
        """Return a fresh channel, or None when not connected."""
        if self.is_connected():
            return self._connection.channel()

    def close(self):
        """Close and drop the connection if it is open."""
        if self.is_connected():
            self._connection.close()
            self._connection = None
            print("Closed RabbitMQ connection")
76+
77+
78+
def publish_to_rabbitmq(queue_name: str, data: str):
    """Publish data to a RabbitMQ queue."""
    try:
        # The context manager opens the connection on entry and closes it
        # on exit, even if publishing raises.
        with RabbitMQConnection() as conn:
            channel = conn.get_channel()

            # Make sure the target queue exists before publishing
            channel.queue_declare(queue=queue_name, durable=True)

            # Ask the broker to confirm each delivery
            channel.confirm_delivery()

            # Persistent message on the default exchange, routed by queue name
            channel.basic_publish(
                exchange="",
                routing_key=queue_name,
                body=data,
                properties=pika.BasicProperties(
                    delivery_mode=2,  # make message persistent
                ),
            )
            print("Sent data to RabbitMQ:", data)
    except pika.exceptions.UnroutableError:
        print("Message could not be routed")
    except Exception as e:
        print(f"Error publishing to RabbitMQ: {e}")


if __name__ == "__main__":
    publish_to_rabbitmq("test_queue", "Hello, World!")

Diff for: 2-data-ingestion/test_cdc.py

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from pymongo import MongoClient
2+
3+
4+
def insert_data_to_mongodb(uri, database_name, collection_name, data):
    """
    Insert data into a MongoDB collection.

    :param uri: MongoDB URI
    :param database_name: Name of the database
    :param collection_name: Name of the collection
    :param data: Data to be inserted (dict)
    """
    client = MongoClient(uri)
    collection = client[database_name][collection_name]

    try:
        inserted = collection.insert_one(data)
        print(f"Data inserted with _id: {inserted.inserted_id}")
    except Exception as exc:
        print(f"An error occurred: {exc}")
    finally:
        # Always release the client, whether or not the insert succeeded
        client.close()


if __name__ == "__main__":
    insert_data_to_mongodb(
        "mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set",
        "scrabble",
        "posts",
        {"platform": "linkedin", "content": "Test content"},
    )

Diff for: config.py

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from pydantic_settings import BaseSettings, SettingsConfigDict
2+
3+
4+
class Settings(BaseSettings):
    """Project-level settings, overridable via the repo-root ``.env`` file."""

    model_config = SettingsConfigDict(env_file="../.env", env_file_encoding="utf-8")

    # MongoDB configs — replica-set URI matching the docker-compose setup.
    MONGO_DATABASE_HOST: str = (
        "mongodb://mongo1:30001,mongo2:30002,mongo3:30003/?replicaSet=my-replica-set"
    )
    MONGO_DATABASE_NAME: str = "scrabble"

    # Optional LinkedIn credentials for scraping your profile
    LINKEDIN_USERNAME: str | None = None
    LINKEDIN_PASSWORD: str | None = None


# Module-level singleton read by the rest of the project.
settings = Settings()

Diff for: crawlers/__init__.py

Whitespace-only changes.

Diff for: crawlers/base.py

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import time
2+
from abc import ABC, abstractmethod
3+
from tempfile import mkdtemp
4+
5+
from db.documents import BaseDocument
6+
from selenium import webdriver
7+
from selenium.webdriver.chrome.options import Options
8+
9+
10+
class BaseCrawler(ABC):
    """Contract for all crawlers: a target document model plus an extract step."""

    # Document class the concrete crawler persists its results into.
    model: type[BaseDocument]

    @abstractmethod
    def extract(self, link: str, **kwargs) -> None: ...
15+
16+
17+
class BaseAbstractCrawler(BaseCrawler, ABC):
    """Crawler base that drives a headless Chrome instance via Selenium."""

    def __init__(self, scroll_limit: int = 5) -> None:
        options = webdriver.ChromeOptions()
        # Binary/driver paths are fixed for the packaged (Lambda-style)
        # container image.
        options.binary_location = "/opt/chrome/chrome"
        # Flags for running headless Chrome in a constrained environment.
        for flag in (
            "--no-sandbox",
            "--headless=new",
            "--single-process",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--log-level=3",
            "--disable-popup-blocking",
            "--disable-notifications",
            "--disable-dev-tools",
            "--ignore-certificate-errors",
            "--no-zygote",
            f"--user-data-dir={mkdtemp()}",
            f"--data-path={mkdtemp()}",
            f"--disk-cache-dir={mkdtemp()}",
            "--remote-debugging-port=9222",
        ):
            options.add_argument(flag)

        # Let subclasses tweak the options before the driver is built.
        self.set_extra_driver_options(options)

        self.scroll_limit = scroll_limit
        self.driver = webdriver.Chrome(
            service=webdriver.ChromeService("/opt/chromedriver"),
            options=options,
        )

    def set_extra_driver_options(self, options: Options) -> None:
        """Hook for subclasses to add driver options; no-op by default."""
        pass

    def login(self) -> None:
        """Hook for subclasses that need an authenticated session; no-op here."""
        pass

    def scroll_page(self) -> None:
        """Scroll to the bottom of the page repeatedly, up to ``scroll_limit`` times.

        Stops early when the page height stops growing (no more content).
        """
        scrolls_done = 0
        previous_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            time.sleep(5)
            current_height = self.driver.execute_script("return document.body.scrollHeight")
            reached_limit = self.scroll_limit and scrolls_done >= self.scroll_limit
            if current_height == previous_height or reached_limit:
                break
            previous_height = current_height
            scrolls_done += 1

Diff for: crawlers/github.py

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import os
2+
import shutil
3+
import subprocess
4+
import tempfile
5+
6+
from aws_lambda_powertools import Logger
7+
8+
from crawlers.base import BaseCrawler
9+
from db.documents import RepositoryDocument
10+
11+
logger = Logger(service="llm-twin-course/crawler")
12+
13+
14+
class GithubCrawler(BaseCrawler):
    """Crawler that clones a GitHub repository and persists its text content."""

    model = RepositoryDocument

    def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None:
        super().__init__()
        # Path prefixes/suffixes skipped while walking the cloned tree.
        self._ignore = ignore

    def extract(self, link: str, **kwargs) -> None:
        """Clone *link* into a temp dir and save its files as one document.

        :param link: URL of the repository to clone.
        :param kwargs: ``user`` (if present) is stored as the document owner.
        """
        logger.info(f"Starting scraping GitHub repository: {link}")

        repo_name = link.rstrip("/").split("/")[-1]

        local_temp = tempfile.mkdtemp()

        try:
            # Clone into the temp dir via cwd= instead of os.chdir (which
            # mutated the process CWD and was never restored); check=True
            # surfaces a failed clone instead of an IndexError on the empty
            # directory below.
            subprocess.run(["git", "clone", link], check=True, cwd=local_temp)

            # git names the checkout dir itself; it is the only entry.
            repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])

            tree = {}
            for root, _, files in os.walk(repo_path):
                rel_dir = root.replace(repo_path, "").lstrip("/")
                if rel_dir.startswith(self._ignore):
                    continue

                for file in files:
                    if file.endswith(self._ignore):
                        continue
                    file_path = os.path.join(rel_dir, file)
                    with open(os.path.join(root, file), "r", errors="ignore") as f:
                        # NOTE(review): stripping *all* spaces is kept from the
                        # original — presumably to shrink the stored content.
                        tree[file_path] = f.read().replace(" ", "")

            instance = self.model(
                name=repo_name, link=link, content=tree, owner_id=kwargs.get("user")
            )
            instance.save()
        finally:
            # Always remove the clone, even when extraction fails.
            shutil.rmtree(local_temp)

        logger.info(f"Finished scraping GitHub repository: {link}")

0 commit comments

Comments (0)