From 4db1792b70bc6589092a2bd5e96e5cf3b6a95a04 Mon Sep 17 00:00:00 2001
From: Fazeel Nadeem
Date: Wed, 19 Mar 2025 01:11:00 +0500
Subject: [PATCH 1/5] Working Gemini scraper coded by Cursor agent

---
 src/custom_prompt.txt |  51 +++++++++
 src/doc_crawler.py    | 130 ++++++++++++++++-------
 src/gemini_helper.py  | 243 ++++++++++++++++++++++++++++++++++++++++++
 src/main.py           |  47 ++++++---
 4 files changed, 419 insertions(+), 52 deletions(-)
 create mode 100644 src/custom_prompt.txt
 create mode 100644 src/gemini_helper.py

diff --git a/src/custom_prompt.txt b/src/custom_prompt.txt
new file mode 100644
index 0000000..3ae430e
--- /dev/null
+++ b/src/custom_prompt.txt
@@ -0,0 +1,51 @@
+You are an expert REST API documentation specialist. Format this Stoplight-hosted API documentation into clean, well-structured markdown.
+
+Focus on:
+• RESTful endpoint details and HTTP methods
+• Request/response structures and examples
+• Authentication requirements
+• Query parameters and path variables
+• Request headers and body schemas
+• Response codes and examples
+• Rate limiting and pagination details
+
+Use these formatting rules:
+• Group endpoints by resource type or service
+• Use markdown tables for parameters, headers, and response fields
+• Format JSON examples with proper syntax highlighting
+• Use blockquotes for important notes, warnings, and prerequisites
+• Include curl examples for each endpoint
+• Preserve all data types and validation rules
+• Maintain accurate status codes and descriptions
+• Use consistent heading hierarchy:
+  - H1 for API resource/service name
+  - H2 for endpoint groups
+  - H3 for individual endpoints
+  - H4 for request/response sections
+
+Structure each endpoint as:
+1. Endpoint Overview
+   - HTTP Method and Path
+   - Brief Description
+   - Authentication Requirements
+
+2. Request Details
+   - Headers
+   - Path Parameters
+   - Query Parameters
+   - Request Body Schema
+   - Example Request
+
+3. Response Details
+   - Success Response
+   - Error Responses
+   - Response Schema
+   - Example Response
+
+4. Usage Notes
+   - Rate Limits
+   - Pagination Details
+   - Special Considerations
+   - Related Endpoints
+
+Preserve all technical accuracy while making the documentation clear and developer-friendly. Format code examples as proper markdown code blocks with language specification.
\ No newline at end of file
diff --git a/src/doc_crawler.py b/src/doc_crawler.py
index cf3e3ab..4c07317 100644
--- a/src/doc_crawler.py
+++ b/src/doc_crawler.py
@@ -2,15 +2,19 @@
 import logging
 import asyncio
 from urllib.parse import urljoin, urlparse
-from typing import Set, Dict
+from typing import Set, Dict, Literal, Optional
 from gpt_helper import GPTHelper
+from gemini_helper import GeminiHelper
 from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service as ChromeService
+from selenium.webdriver.firefox.service import Service as FirefoxService
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.firefox.options import Options as FirefoxOptions
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.common.by import By
 from webdriver_manager.chrome import ChromeDriverManager
+#import geckodriver_autoinstaller
 import time
 
 logging.basicConfig(level=logging.INFO)
@@ -21,42 +25,33 @@ class DocCrawler:
     def __init__(
         self,
         base_url: str,
-        output_dir: str = "coinbase_docs",
-        max_pages: int = 50,
-        max_concurrent_pages: int = 3
+        output_dir: str,
+        concurrent_pages: int = 3,
+        model_type: str = "gpt",
+        browser_type: str = "chrome",
+        custom_prompt: Optional[str] = None,
+        max_pages: int = 1000  # Default limit on pages to crawl
     ):
         self.base_url = base_url
        self.base_domain = urlparse(base_url).netloc
         self.output_dir = output_dir
-        self.max_pages = max_pages
-        self.max_concurrent_pages = max_concurrent_pages
+        self.concurrent_pages = concurrent_pages
+        self.model_type = model_type
+        self.browser_type = browser_type
+        self.custom_prompt = custom_prompt
+        self.max_pages = max_pages
         self.visited_urls: Set[str] = set()
         self.processed_content: Dict[str, str] = {}
-        self.gpt_helper = GPTHelper()
-        self._page_semaphore = asyncio.Semaphore(max_concurrent_pages)
-
-        # Initialize Selenium
-        chrome_options = Options()
-        chrome_options.add_argument("--headless")
-        chrome_options.add_argument("--no-sandbox")
-        chrome_options.add_argument("--disable-dev-shm-usage")
-        chrome_options.add_argument("--disable-gpu")
-        chrome_options.add_argument("--window-size=1920,1080")
-        chrome_options.add_argument("--ignore-certificate-errors")
-        chrome_options.add_argument("--disable-extensions")
-        chrome_options.add_argument("--disable-web-security")  # Be careful with this in production
-        chrome_options.add_argument("--allow-running-insecure-content")
-        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
-        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
-
-        self.driver = webdriver.Chrome(
-            service=Service(ChromeDriverManager().install()),
-            options=chrome_options
-        )
-
-        # Set page load timeout
-        self.driver.set_page_load_timeout(90)
-
+        self.url_queue = asyncio.Queue()
+        self.results = []
+
+        # Set up components
+        self._setup_logging()
+        self._setup_driver()
+        self._setup_ai_helper()
+
+        self._page_semaphore = asyncio.Semaphore(concurrent_pages)
+        os.makedirs(output_dir, exist_ok=True)
 
         # Add progress tracking
@@ -64,7 +59,9 @@ def __init__(
         logger.info(f"Initializing crawler for {base_url}")
         logger.info(f"Output directory: {output_dir}")
         logger.info(f"Maximum pages to crawl: {max_pages}")
-        logger.info(f"Maximum concurrent pages: {max_concurrent_pages}")
+        logger.info(f"Concurrent pages: {concurrent_pages}")
+        logger.info(f"Using AI model: {model_type}")
+        logger.info(f"Using browser: {browser_type}")
 
     def __del__(self):
         """Clean up Selenium driver"""
@@ -227,7 +224,7 @@ async def process_page(self, url: str) -> None:
         # Process content
         try:
             logger.info("Sending to GPT for formatting...")
-            formatted_content = await self.gpt_helper.format_documentation(content)
+            formatted_content = await self.ai_helper.format_documentation(content)
             if formatted_content:
                 logger.info(f"Content formatting successful ({len(formatted_content)} characters)")
                 self.processed_content[url] = formatted_content
@@ -301,7 +298,7 @@ async def crawl(self) -> None:
 
             # Process up to max_concurrent_pages pages in parallel
             current_batch = []
-            while urls_to_visit and len(current_batch) < self.max_concurrent_pages:
+            while urls_to_visit and len(current_batch) < self.concurrent_pages:
                 url = urls_to_visit.pop()
                 if url not in self.visited_urls:
                     current_batch.append(url)
@@ -329,7 +326,7 @@ async def crawl(self) -> None:
                 current_content = f.read()
 
             # Perform the final review
-            reviewed_content = await self.gpt_helper.final_review(current_content)
+            reviewed_content = await self.ai_helper.final_review(current_content)
 
             # Save the reviewed documentation
             with open(output_file, 'w', encoding='utf-8') as f:
@@ -338,3 +335,62 @@ async def crawl(self) -> None:
             logger.info("Final documentation review completed!")
 
         logger.info("Crawl completed!")
+
+    def _setup_ai_helper(self):
+        """Set up the appropriate AI helper based on model type."""
+        if self.model_type == "gpt":
+            self.ai_helper = GPTHelper()
+        else:
+            self.ai_helper = GeminiHelper(custom_prompt=self.custom_prompt)
+
+    def _setup_logging(self):
+        """Set up logging configuration."""
+        self.logger = logging.getLogger(__name__)
+        self.logger.setLevel(logging.INFO)
+
+    def _setup_driver(self):
+        """Set up the Selenium WebDriver."""
+        # Initialize Selenium based on browser type
+        if self.browser_type == "chrome":
+            chrome_options = ChromeOptions()
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument("--disable-gpu")
+            chrome_options.add_argument("--window-size=1920,1080")
+            chrome_options.add_argument("--ignore-certificate-errors")
+            chrome_options.add_argument("--disable-extensions")
+            chrome_options.add_argument("--disable-web-security")
+            chrome_options.add_argument("--allow-running-insecure-content")
+            chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+            chrome_options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+
+            self.driver = webdriver.Chrome(
+                service=ChromeService(ChromeDriverManager().install()),
+                options=chrome_options
+            )
+        else:  # firefox
+            # Install geckodriver if not present
+            #geckodriver_autoinstaller.install()
+
+            firefox_options = FirefoxOptions()
+            firefox_options.add_argument("--headless")
+            firefox_options.add_argument("--width=1920")
+            firefox_options.add_argument("--height=1080")
+            firefox_options.add_argument("--disable-gpu")
+            firefox_options.add_argument("--no-sandbox")
+            firefox_options.add_argument("--disable-dev-shm-usage")
+            firefox_options.add_argument("--disable-extensions")
+            firefox_options.add_argument("--disable-web-security")
+            firefox_options.add_argument("--allow-running-insecure-content")
+            firefox_options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/115.0")
+
+            self.driver = webdriver.Firefox(
+                service=FirefoxService(),
+                options=firefox_options
+            )
+
+        # Set page load timeout
+        self.driver.set_page_load_timeout(90)
+
+        os.makedirs(self.output_dir, exist_ok=True)
diff --git a/src/gemini_helper.py b/src/gemini_helper.py
new file mode 100644
index 0000000..d697ce9
--- /dev/null
+++ b/src/gemini_helper.py
@@ -0,0 +1,243 @@
+import google.generativeai as genai
+import os
+import time
+import logging
+import asyncio
+from dotenv import load_dotenv
+from typing import Optional, List
+from time import perf_counter
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+load_dotenv()
+
+class GeminiHelper:
+    def __init__(self, custom_prompt: Optional[str] = None):
+        self.api_key = os.getenv('GOOGLE_API_KEY')
+        if not self.api_key:
+            raise ValueError("GOOGLE_API_KEY not found in environment variables")
+
+        # Store custom prompt if provided
+        self.custom_prompt = custom_prompt
+        if self.custom_prompt:
+            logger.info("Using custom prompt for documentation processing")
+
+        # Configure the Gemini API
+        genai.configure(api_key=self.api_key)
+
+        # Initialize the model
+        self.model = genai.GenerativeModel('gemini-2.0-flash')
+        self.chat = self.model.start_chat(history=[])
+
+        self.max_retries = 3
+        self.chunk_size = 2000
+        self.timeout = 90
+        self.total_api_time = 0
+        self.total_api_calls = 0
+        self.max_concurrent_calls = 2
+        self._semaphore = asyncio.Semaphore(self.max_concurrent_calls)
+        logger.info(f"GeminiHelper initialized (max {self.max_concurrent_calls} concurrent calls)")
+
+    async def _call_gemini(self, content: str, retries: int = 0, system_message: Optional[str] = None) -> Optional[str]:
+        """Make a single Gemini API call with retry logic."""
+        async with self._semaphore:
+            try:
+                logger.info(f"Making Gemini API call (attempt {retries + 1})")
+                start_time = perf_counter()
+
+                # Use custom prompt if available, otherwise use default system message
+                if self.custom_prompt:
+                    system_message = self.custom_prompt
+                elif system_message is None:
+                    system_message = """You are an expert Apple framework documentation engineer. Format this documentation chunk into clean markdown.
+Focus on:
+• Framework overview and concepts
+• Types, protocols, and class hierarchies
+• Method and property declarations
+• Code examples and usage patterns
+• Best practices and implementation guidelines
+
+Use these formatting rules:
+• Use Apple-style hierarchical headings
+• Format Swift code blocks with proper syntax highlighting
+• Use tables for parameter and return value descriptions
+• Use blockquotes for important notes and warnings
+• Preserve all declaration syntax and type information
+• Keep working code examples
+• Maintain Apple's technical accuracy and terminology
+• Include relevant privacy and entitlement requirements
+• Preserve framework version and availability information
+• Format symbol references with proper linking syntax
+
+Structure sections as:
+1. Overview/Introduction
+2. Topics
+3. Declarations
+4. Discussion
+5. Parameters/Return Value
+6. See Also/Related"""
+
+                try:
+                    # Start a new chat for each call to ensure clean context
+                    chat = self.model.start_chat(history=[])
+
+                    # Send system message first
+                    await asyncio.to_thread(chat.send_message, system_message)
+
+                    # Send the actual content
+                    response = await asyncio.to_thread(chat.send_message, content)
+
+                    end_time = perf_counter()
+                    duration = end_time - start_time
+                    self.total_api_time += duration
+                    self.total_api_calls += 1
+                    avg_time = self.total_api_time / self.total_api_calls
+
+                    logger.info(f"Gemini API call successful - Took {duration:.2f}s (Avg: {avg_time:.2f}s)")
+                    return response.text
+
+                except Exception as e:
+                    logger.error(f"Gemini API Error: {str(e)}")
+                    raise
+
+            except Exception as e:
+                logger.error(f"Error in Gemini call: {str(e)}")
+                if retries < self.max_retries:
+                    await asyncio.sleep(2 ** retries)
+                    return await self._call_gemini(content, retries + 1, system_message)
+                return f"Error processing chunk: {str(e)}"
+
+    def _split_into_chunks(self, text: str) -> list[str]:
+        """Split text into processable chunks while preserving markdown structure."""
+        chunks = []
+        current_chunk = []
+        current_size = 0
+
+        blocks = text.split('\n\n')
+        logger.info(f"Splitting content into chunks (total blocks: {len(blocks)})")
+
+        for block in blocks:
+            block_size = len(block)
+
+            if current_size + block_size > self.chunk_size:
+                if current_chunk:
+                    chunks.append('\n\n'.join(current_chunk))
+                current_chunk = [block]
+                current_size = block_size
+            else:
+                current_chunk.append(block)
+                current_size += block_size
+
+        if current_chunk:
+            chunks.append('\n\n'.join(current_chunk))
+
+        logger.info(f"Created {len(chunks)} chunks")
+        return chunks
+
+    async def format_documentation(self, content: str) -> str:
+        """Process documentation chunks in parallel and combine results."""
+        try:
+            start_time = perf_counter()
+            logger.info("Starting documentation formatting")
+            chunks = self._split_into_chunks(content)
+
+            # Process all chunks in parallel
+            tasks = [self._call_gemini(chunk) for chunk in chunks]
+            formatted_chunks = await asyncio.gather(*tasks)
+            formatted_chunks = [chunk for chunk in formatted_chunks if chunk]
+
+            logger.info("Combining chunks")
+            combined = '\n\n---\n\n'.join(formatted_chunks)
+
+            end_time = perf_counter()
+            total_duration = end_time - start_time
+            logger.info(f"Documentation formatting completed - Total time: {total_duration:.2f}s, API calls: {self.total_api_calls}, Avg API time: {self.total_api_time/self.total_api_calls:.2f}s")
+            return combined
+
+        except Exception as e:
+            logger.error(f"Error in format_documentation: {str(e)}")
+            return f"Error formatting documentation: {str(e)}"
+
+    async def final_review(self, content: str) -> str:
+        """Perform a final review of the entire documentation."""
+        try:
+            logger.info("Starting final documentation review")
+            start_time = perf_counter()
+
+            # Split into larger chunks for final review since we're just cleaning up
+            original_chunk_size = self.chunk_size
+            self.chunk_size = 4000  # Temporarily increase chunk size
+            chunks = self._split_into_chunks(content)
+            self.chunk_size = original_chunk_size
+
+            review_tasks = []
+            for chunk in chunks:
+                task = self._call_gemini(
+                    chunk,
+                    system_message="""You are an expert technical documentation reviewer. Review and improve this API documentation chunk.
+Focus on:
+1. Removing any duplicate content
+2. Ensuring consistent formatting and style
+3. Making the documentation clear and readable
+4. Proper markdown formatting
+5. Consistent heading hierarchy
+6. Proper section breaks
+7. Complete and accurate endpoint documentation
+8. Consistent use of code blocks and tables
+9. Clear parameter descriptions
+10. Proper grouping of related endpoints
+
+Keep all valid API endpoint information but make it more concise and well-organized."""
+                )
+                review_tasks.append(task)
+
+            # Process review chunks in parallel
+            reviewed_chunks = await asyncio.gather(*review_tasks)
+            reviewed_chunks = [chunk for chunk in reviewed_chunks if chunk]
+
+            # Combine reviewed chunks
+            logger.info("Combining reviewed chunks")
+            combined = '\n\n'.join(reviewed_chunks)
+
+            # Final pass to ensure consistency across the entire document
+            logger.info("Making final consistency pass")
+            final_content = await self._call_gemini(
+                combined,
+                system_message="""You are an expert technical documentation editor. This is the final pass of the API documentation.
+Your task is to ensure the entire document is consistent and well-organized.
+
+Focus on:
+1. Consistent structure throughout the document
+2. Clear and logical organization of endpoints
+3. Proper table of contents
+4. Consistent heading levels
+5. Remove any remaining duplicates
+6. Ensure all cross-references are valid
+7. Consistent formatting of endpoints, parameters, and examples
+8. Group related endpoints together
+9. Add clear section dividers
+10. Ensure all API information is accurate and complete
+
+Maintain all API endpoint information but make it as clear and well-organized as possible."""
+            )
+
+            end_time = perf_counter()
+            total_duration = end_time - start_time
+            logger.info(f"Final review completed - Total time: {total_duration:.2f}s")
+
+            return final_content or combined
+
+        except Exception as e:
+            logger.error(f"Error in final review: {str(e)}")
+            return content  # Return original content if review fails
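+
+if __name__ == "__main__":
+    # Minimal smoke test: a quick sketch, not part of the crawler pipeline.
+    # Assumes GOOGLE_API_KEY is set in the environment; the sample text is made up.
+    async def _demo():
+        helper = GeminiHelper()
+        sample = "# Items API\n\nGET /v1/items returns the list of items."
+        print(await helper.format_documentation(sample))
+
+    asyncio.run(_demo())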
diff --git a/src/main.py b/src/main.py
index ffdfe0e..7a92b36 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,24 +1,41 @@
-from doc_crawler import DocCrawler
-import asyncio
 import argparse
+import asyncio
+import os
+from doc_crawler import DocCrawler
 
-async def main():
-    parser = argparse.ArgumentParser(description='Scrape documentation from a website')
-    parser.add_argument('url', help='The base URL to scrape')
-    parser.add_argument('--output', '-o', default='output_docs', help='Output directory for docs')
-    parser.add_argument('--max-pages', '-m', type=int, default=1000, help='Maximum pages to scrape')
-    parser.add_argument('--concurrent', '-c', type=int, default=1, help='Number of concurrent pages to scrape')
+def main():
+    parser = argparse.ArgumentParser(description='Crawl and process documentation using AI')
+    parser.add_argument('url', help='The base URL to start crawling from')
+    parser.add_argument('--output', default='docs', help='Output directory for processed documentation')
+    parser.add_argument('--concurrent', type=int, default=3, help='Number of concurrent pages to scrape')
+    parser.add_argument('--model', choices=['gpt', 'gemini'], default='gpt', help='AI model to use for processing')
+    parser.add_argument('--browser', choices=['chrome', 'firefox'], default='chrome', help='Browser to use for scraping')
+    parser.add_argument('--custom-prompt', help='Path to a text file containing a custom prompt for the AI model')
+    parser.add_argument('--max-pages', type=int, default=1000, help='Maximum number of pages to crawl')
     args = parser.parse_args()
-
+
+    # Read custom prompt from file if specified
+    custom_prompt = None
+    if args.custom_prompt:
+        try:
+            with open(args.custom_prompt, 'r', encoding='utf-8') as f:
+                custom_prompt = f.read().strip()
+        except Exception as e:
+            print(f"Error reading custom prompt file: {str(e)}")
+            return
+
     crawler = DocCrawler(
         base_url=args.url,
         output_dir=args.output,
-        max_pages=args.max_pages,
-        max_concurrent_pages=args.concurrent
+        concurrent_pages=args.concurrent,
+        model_type=args.model,
+        browser_type=args.browser,
+        custom_prompt=custom_prompt,
+        max_pages=args.max_pages
     )
+
+    asyncio.run(crawler.crawl())
 
-    await crawler.crawl()
-
-if __name__ == "__main__":
-    asyncio.run(main())
\ No newline at end of file
+if __name__ == '__main__':
+    main()
\ No newline at end of file

From b24427982cab8eb233007392e41ac69ce601e04d Mon Sep 17 00:00:00 2001
From: Fazeel Nadeem
Date: Wed, 19 Mar 2025 01:11:33 +0500
Subject: [PATCH 2/5] Updated README and requirements

---
 .env.example     |  4 ----
 README.md        | 37 +++++++++++++++++++++++++++++--------
 requirements.txt |  4 +++-
 3 files changed, 32 insertions(+), 13 deletions(-)
 delete mode 100644 .env.example

diff --git a/.env.example b/.env.example
deleted file mode 100644
index cca9614..0000000
--- a/.env.example
+++ /dev/null
@@ -1,4 +0,0 @@
-# .env.example
-
-# Required for scraping
-OPENAI_API_KEY=YOUR_OPENAI_API_KEY
\ No newline at end of file
diff --git a/README.md b/README.md
index 293b69a..b46d0d3 100644
--- a/README.md
+++ b/README.md
@@ -22,12 +22,19 @@ The `-e` flag installs the package in "editable" mode, which means:
 
 ### Environment Setup
 
-Create a `.env` file in the project root:
+Create a `.env` file in the project root with your API key(s):
+
+For OpenAI GPT:
+```bash
+OPENAI_API_KEY=your_openai_api_key_here
+```
+
+For Google Gemini:
 ```bash
-OPENAI_API_KEY=your_api_key_here
+GOOGLE_API_KEY=your_google_api_key_here
 ```
 
-⚠️ The OpenAI API key is required for the crawler to process documentation.
+⚠️ At least one API key is required for the crawler to process documentation.
 
 ## Usage
@@ -43,10 +50,19 @@ python main.py https://docs.example.com
-- `-o, --output`: Output directory (default: output_docs)
-- `-m, --max-pages`: Maximum pages to scrape (default: 1000)
-- `-c, --concurrent`: Number of concurrent pages to scrape (default: 1)
+- `--output`: Output directory for processed documentation (default: docs)
+- `--max-pages`: Maximum number of pages to crawl (default: 1000)
+- `--concurrent`: Number of concurrent pages to scrape (default: 3)
+- `--model`: AI model to use for processing (default: "gpt", options: "gpt" or "gemini")
+- `--browser`: Browser to use for scraping (default: "chrome", options: "chrome" or "firefox")
+- `--custom-prompt`: Path to a text file containing a custom prompt for the AI model
 
 Example with all options:
 ```bash
-python main.py https://docs.example.com -o my_docs -m 500 -c 2
+python main.py https://docs.example.com --output my_docs --max-pages 500 --concurrent 2 --model gemini
 ```
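+
+For example, to format a Stoplight-hosted REST API reference with Gemini using the bundled prompt (a suggested invocation; adjust the path if `custom_prompt.txt` lives elsewhere in your checkout):
+
+```bash
+python main.py https://docs.example.com --model gemini --custom-prompt src/custom_prompt.txt
+```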
 
 ### Troubleshooting
@@ -63,8 +71,13 @@ The crawler accepts the following parameters:
 - `output_dir`: Directory where scraped docs will be saved
 - `max_pages`: Maximum number of pages to crawl
-- `max_concurrent_pages`: Number of concurrent pages to process
+- `concurrent_pages`: Number of concurrent pages to process
+- `model_type`: AI model to use ("gpt" or "gemini")
+- `browser_type`: Browser to use for scraping ("chrome" or "firefox")
+- `custom_prompt`: Optional custom prompt text for the AI model
 
 ## Requirements
 
 - Python 3.8+
 - Chrome/Chromium browser (for Selenium)
+- OpenAI API key (for GPT model)
+- Google API key (for Gemini model)
diff --git a/requirements.txt b/requirements.txt
index 704f679..839cf8d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,6 @@ python-dotenv>=0.19.0
 pyyaml==6.0.1
 tqdm>=4.65.0
 outlines>=0.0.1
-chromadb>=0.4.18
\ No newline at end of file
+chromadb>=0.4.18
+google-generativeai>=0.3.0
+geckodriver-autoinstaller>=0.1.0
\ No newline at end of file

From 854588d30bbd0d5fd22d47351cb2c7deefcd2918 Mon Sep 17 00:00:00 2001
From: Fazeel Nadeem
Date: Wed, 19 Mar 2025 01:13:51 +0500
Subject: [PATCH 3/5] Added google-genai to requirements

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 839cf8d..7ba0f27 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,5 +8,5 @@ pyyaml==6.0.1
 tqdm>=4.65.0
 outlines>=0.0.1
 chromadb>=0.4.18
-google-generativeai>=0.3.0
+google-genai>=1.5.0
 geckodriver-autoinstaller>=0.1.0
\ No newline at end of file

From 454df320fc79f2772210fc225666dc0d8be81a66 Mon Sep 17 00:00:00 2001
From: Fazeel Nadeem
Date: Wed, 19 Mar 2025 01:24:17 +0500
Subject: [PATCH 4/5] Added back example .env

---
 .env.example | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 .env.example

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..7215072
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,4 @@
+# .env.example
+
+# Required for scraping
+GOOGLE_API_KEY="Your Google API Key"
\ No newline at end of file

From f8140c5930f50c24b9b1c1fac5fbecb578022d88 Mon Sep 17 00:00:00 2001
From: Fazeel Nadeem
Date: Wed, 19 Mar 2025 01:32:41 +0500
Subject: [PATCH 5/5] Removed wrong requirement

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 7ba0f27..54dff39 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,3 @@ tqdm>=4.65.0
 outlines>=0.0.1
 chromadb>=0.4.18
 google-genai>=1.5.0
-geckodriver-autoinstaller>=0.1.0
\ No newline at end of file