From 4db1792b70bc6589092a2bd5e96e5cf3b6a95a04 Mon Sep 17 00:00:00 2001
From: Fazeel Nadeem
Date: Wed, 19 Mar 2025 01:11:00 +0500
Subject: [PATCH 1/5] Working Gemini scraper coded by Cursor agent

---
 src/custom_prompt.txt |  51 +++++++++
 src/doc_crawler.py    | 130 ++++++++++++++++-------
 src/gemini_helper.py  | 243 ++++++++++++++++++++++++++++++++++++++++++
 src/main.py           |  47 ++++++---
 4 files changed, 419 insertions(+), 52 deletions(-)
 create mode 100644 src/custom_prompt.txt
 create mode 100644 src/gemini_helper.py

diff --git a/src/custom_prompt.txt b/src/custom_prompt.txt
new file mode 100644
index 0000000..3ae430e
--- /dev/null
+++ b/src/custom_prompt.txt
@@ -0,0 +1,51 @@
+You are an expert REST API documentation specialist. Format this Stoplight-hosted API documentation into clean, well-structured markdown.
+
+Focus on:
+• RESTful endpoint details and HTTP methods
+• Request/response structures and examples
+• Authentication requirements
+• Query parameters and path variables
+• Request headers and body schemas
+• Response codes and examples
+• Rate limiting and pagination details
+
+Use these formatting rules:
+• Group endpoints by resource type or service
+• Use markdown tables for parameters, headers, and response fields
+• Format JSON examples with proper syntax highlighting
+• Use blockquotes for important notes, warnings, and prerequisites
+• Include curl examples for each endpoint
+• Preserve all data types and validation rules
+• Maintain accurate status codes and descriptions
+• Use consistent heading hierarchy:
+  - H1 for API resource/service name
+  - H2 for endpoint groups
+  - H3 for individual endpoints
+  - H4 for request/response sections
+
+Structure each endpoint as:
+1. Endpoint Overview
+   - HTTP Method and Path
+   - Brief Description
+   - Authentication Requirements
+
+2. Request Details
+   - Headers
+   - Path Parameters
+   - Query Parameters
+   - Request Body Schema
+   - Example Request
+
+3. Response Details
+   - Success Response
+   - Error Responses
+   - Response Schema
+   - Example Response
+
+4. Usage Notes
+   - Rate Limits
+   - Pagination Details
+   - Special Considerations
+   - Related Endpoints
+
+Preserve all technical accuracy while making the documentation clear and developer-friendly. Format code examples as proper markdown code blocks with language specification.
\ No newline at end of file
diff --git a/src/doc_crawler.py b/src/doc_crawler.py
index cf3e3ab..4c07317 100644
--- a/src/doc_crawler.py
+++ b/src/doc_crawler.py
@@ -2,15 +2,19 @@
 import logging
 import asyncio
 from urllib.parse import urljoin, urlparse
-from typing import Set, Dict
+from typing import Set, Dict, Literal, Optional
 from gpt_helper import GPTHelper
+from gemini_helper import GeminiHelper
 from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service as ChromeService
+from selenium.webdriver.firefox.service import Service as FirefoxService
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.firefox.options import Options as FirefoxOptions
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.common.by import By
 from webdriver_manager.chrome import ChromeDriverManager
+#import geckodriver_autoinstaller
 import time
 
 logging.basicConfig(level=logging.INFO)
@@ -21,42 +25,33 @@ class DocCrawler:
     def __init__(
         self,
         base_url: str,
-        output_dir: str = "coinbase_docs",
-        max_pages: int = 50,
-        max_concurrent_pages: int = 3
+        output_dir: str,
+        concurrent_pages: int = 3,
+        model_type: str = "gpt",
+        browser_type: str = "chrome",
+        custom_prompt: Optional[str] = None,
+        max_pages: int = 1000  # Default limit on pages to crawl
     ):
         self.base_url = base_url
        self.base_domain = urlparse(base_url).netloc
         self.output_dir = output_dir
-        self.max_pages = max_pages
-        self.max_concurrent_pages = max_concurrent_pages
+        self.concurrent_pages = concurrent_pages
+        self.model_type = model_type
+        self.browser_type = browser_type
+        self.custom_prompt = custom_prompt
+        self.max_pages = max_pages
         self.visited_urls: Set[str] = set()
         self.processed_content: Dict[str, str] = {}
-        self.gpt_helper = GPTHelper()
-        self._page_semaphore = asyncio.Semaphore(max_concurrent_pages)
-
-        # Initialize Selenium
-        chrome_options = Options()
-        chrome_options.add_argument("--headless")
-        chrome_options.add_argument("--no-sandbox")
-        chrome_options.add_argument("--disable-dev-shm-usage")
-        chrome_options.add_argument("--disable-gpu")
-        chrome_options.add_argument("--window-size=1920,1080")
-        chrome_options.add_argument("--ignore-certificate-errors")
-        chrome_options.add_argument("--disable-extensions")
-        chrome_options.add_argument("--disable-web-security")  # Be careful with this in production
-        chrome_options.add_argument("--allow-running-insecure-content")
-        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
-        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
-
-        self.driver = webdriver.Chrome(
-            service=Service(ChromeDriverManager().install()),
-            options=chrome_options
-        )
-
-        # Set page load timeout
-        self.driver.set_page_load_timeout(90)
-
+        self.url_queue = asyncio.Queue()
+        self.results = []
+
+        # Set up components
+        self._setup_logging()
+        self._setup_driver()
+        self._setup_ai_helper()
+
+        self._page_semaphore = asyncio.Semaphore(concurrent_pages)
+        os.makedirs(output_dir, exist_ok=True)
 
         # Add progress tracking
@@ -64,7 +59,9 @@ def __init__(
         logger.info(f"Initializing crawler for {base_url}")
         logger.info(f"Output directory: {output_dir}")
         logger.info(f"Maximum pages to crawl: {max_pages}")
-        logger.info(f"Maximum concurrent pages: {max_concurrent_pages}")
+        logger.info(f"Concurrent pages: {concurrent_pages}")
+        logger.info(f"Using AI model: {model_type}")
+        logger.info(f"Using browser: {browser_type}")
 
     def __del__(self):
         """Clean up Selenium driver"""
@@ -227,7 +224,7 @@ async def process_page(self, url: str) -> None:
         # Process content
         try:
             logger.info("Sending to GPT for formatting...")
-            formatted_content = await self.gpt_helper.format_documentation(content)
+            formatted_content = await self.ai_helper.format_documentation(content)
             if formatted_content:
                 logger.info(f"Content formatting successful ({len(formatted_content)} characters)")
                 self.processed_content[url] = formatted_content
@@ -301,7 +298,7 @@ async def crawl(self) -> None:
 
             # Process up to max_concurrent_pages pages in parallel
             current_batch = []
-            while urls_to_visit and len(current_batch) < self.max_concurrent_pages:
+            while urls_to_visit and len(current_batch) < self.concurrent_pages:
                 url = urls_to_visit.pop()
                 if url not in self.visited_urls:
                     current_batch.append(url)
@@ -329,7 +326,7 @@ async def crawl(self) -> None:
                 current_content = f.read()
 
             # Perform the final review
-            reviewed_content = await self.gpt_helper.final_review(current_content)
+            reviewed_content = await self.ai_helper.final_review(current_content)
 
             # Save the reviewed documentation
             with open(output_file, 'w', encoding='utf-8') as f:
@@ -338,3 +335,62 @@ async def crawl(self) -> None:
             logger.info("Final documentation review completed!")
 
         logger.info("Crawl completed!")
+
+    def _setup_ai_helper(self):
+        """Set up the appropriate AI helper based on model type."""
+        if self.model_type == "gpt":
+            self.ai_helper = GPTHelper()
+        else:
+            self.ai_helper = GeminiHelper(custom_prompt=self.custom_prompt)
+
+    def _setup_logging(self):
+        """Set up logging configuration."""
+        self.logger = logging.getLogger(__name__)
+        self.logger.setLevel(logging.INFO)
+
+    def _setup_driver(self):
+        """Set up the Selenium WebDriver."""
+        # Initialize Selenium based on browser type
+        if self.browser_type == "chrome":
+            chrome_options = ChromeOptions()
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument("--disable-gpu")
+            chrome_options.add_argument("--window-size=1920,1080")
+            chrome_options.add_argument("--ignore-certificate-errors")
+            chrome_options.add_argument("--disable-extensions")
+            chrome_options.add_argument("--disable-web-security")
+            chrome_options.add_argument("--allow-running-insecure-content")
+            chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+            chrome_options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+
+            self.driver = webdriver.Chrome(
+                service=ChromeService(ChromeDriverManager().install()),
+                options=chrome_options
+            )
+        else:  # firefox
+            # Install geckodriver if not present
+            #geckodriver_autoinstaller.install()
+
+            firefox_options = FirefoxOptions()
+            firefox_options.add_argument("--headless")
+            firefox_options.add_argument("--width=1920")
+            firefox_options.add_argument("--height=1080")
+            firefox_options.add_argument("--disable-gpu")
+            firefox_options.add_argument("--no-sandbox")
+            firefox_options.add_argument("--disable-dev-shm-usage")
+            firefox_options.add_argument("--disable-extensions")
+            firefox_options.add_argument("--disable-web-security")
+            firefox_options.add_argument("--allow-running-insecure-content")
+            firefox_options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/115.0")
+
+            self.driver = webdriver.Firefox(
+                service=FirefoxService(),
+                options=firefox_options
+            )
+
+        # Set page load timeout
+        self.driver.set_page_load_timeout(90)
+
+        os.makedirs(self.output_dir, exist_ok=True)
diff --git a/src/gemini_helper.py b/src/gemini_helper.py
new file mode 100644
index 0000000..d697ce9
--- /dev/null
+++ b/src/gemini_helper.py
@@ -0,0 +1,243 @@
+import google.generativeai as genai
+import os
+import time
+import logging
+import asyncio
+from dotenv import load_dotenv
+from typing import Optional, List
+from time import perf_counter
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+load_dotenv()
+
+class GeminiHelper:
+    def __init__(self, custom_prompt: Optional[str] = None):
+        self.api_key = os.getenv('GOOGLE_API_KEY')
+        if not self.api_key:
+            raise ValueError("GOOGLE_API_KEY not found in environment variables")
+
+        # Store custom prompt if provided
+        self.custom_prompt = custom_prompt
+        if self.custom_prompt:
+            logger.info("Using custom prompt for documentation processing")
+
+        # Configure the Gemini API
+        genai.configure(api_key=self.api_key)
+
+        # Initialize the model
+        self.model = genai.GenerativeModel('gemini-2.0-flash')
+        self.chat = self.model.start_chat(history=[])
+
+        self.max_retries = 3
+        self.chunk_size = 2000
+        self.timeout = 90
+        self.total_api_time = 0
+        self.total_api_calls = 0
+        self.max_concurrent_calls = 2
+        self._semaphore = asyncio.Semaphore(self.max_concurrent_calls)
+        logger.info(f"GeminiHelper initialized (max {self.max_concurrent_calls} concurrent calls)")
+
+    async def _call_gemini(self, content: str, retries: int = 0, system_message: Optional[str] = None) -> Optional[str]:
+        """Make a single Gemini API call with retry logic."""
+        async with self._semaphore:
+            try:
+                logger.info(f"Making Gemini API call (attempt {retries + 1})")
+                start_time = perf_counter()
+
+                # Use custom prompt if available, otherwise use default system message
+                if self.custom_prompt:
+                    system_message = self.custom_prompt
+                elif system_message is None:
+                    system_message = """You are an expert Apple framework documentation engineer. Format this documentation chunk into clean markdown.
+Focus on:
+• Framework overview and concepts
+• Types, protocols, and class hierarchies
+• Method and property declarations
+• Code examples and usage patterns
+• Best practices and implementation guidelines
+
+Use these formatting rules:
+• Use Apple-style hierarchical headings
+• Format Swift code blocks with proper syntax highlighting
+• Use tables for parameter and return value descriptions
+• Use blockquotes for important notes and warnings
+• Preserve all declaration syntax and type information
+• Keep working code examples
+• Maintain Apple's technical accuracy and terminology
+• Include relevant privacy and entitlement requirements
+• Preserve framework version and availability information
+• Format symbol references with proper linking syntax
+
+Structure sections as:
+1. Overview/Introduction
+2. Topics
+3. Declarations
+4. Discussion
+5. Parameters/Return Value
+6. See Also/Related"""
+
+                try:
+                    # Start a new chat for each call to ensure clean context
+                    chat = self.model.start_chat(history=[])
+
+                    # Send system message first
+                    await asyncio.to_thread(chat.send_message, system_message)
+
+                    # Send the actual content
+                    response = await asyncio.to_thread(chat.send_message, content)
+
+                    end_time = perf_counter()
+                    duration = end_time - start_time
+                    self.total_api_time += duration
+                    self.total_api_calls += 1
+                    avg_time = self.total_api_time / self.total_api_calls
+
+                    logger.info(f"Gemini API call successful - Took {duration:.2f}s (Avg: {avg_time:.2f}s)")
+                    return response.text
+
+                except Exception as e:
+                    logger.error(f"Gemini API Error: {str(e)}")
+                    raise
+
+            except Exception as e:
+                logger.error(f"Error in Gemini call: {str(e)}")
+                if retries < self.max_retries:
+                    await asyncio.sleep(2 ** retries)
+                    return await self._call_gemini(content, retries + 1, system_message)
+                return f"Error processing chunk: {str(e)}"
+
+    def _split_into_chunks(self, text: str) -> list[str]:
+        """Split text into processable chunks while preserving markdown structure."""
+        chunks = []
+        current_chunk = []
+        current_size = 0
+
+        blocks = text.split('\n\n')
+        logger.info(f"Splitting content into chunks (total blocks: {len(blocks)})")
+
+        for block in blocks:
+            block_size = len(block)
+
+            if current_size + block_size > self.chunk_size:
+                if current_chunk:
+                    chunks.append('\n\n'.join(current_chunk))
+                current_chunk = [block]
+                current_size = block_size
+            else:
+                current_chunk.append(block)
+                current_size += block_size
+
+        if current_chunk:
+            chunks.append('\n\n'.join(current_chunk))
+
+        logger.info(f"Created {len(chunks)} chunks")
+        return chunks
+
+    async def format_documentation(self, content: str) -> str:
+        """Process documentation chunks in parallel and combine results."""
+        try:
+            start_time = perf_counter()
+            logger.info("Starting documentation formatting")
+            chunks = self._split_into_chunks(content)
+
+            # Process all chunks in parallel
+            tasks = [self._call_gemini(chunk) for chunk in chunks]
+            formatted_chunks = await asyncio.gather(*tasks)
+            formatted_chunks = [chunk for chunk in formatted_chunks if chunk]
+
+            logger.info("Combining chunks")
+            combined = '\n\n---\n\n'.join(formatted_chunks)
+
+            end_time = perf_counter()
+            total_duration = end_time - start_time
+            logger.info(f"Documentation formatting completed - Total time: {total_duration:.2f}s, API calls: {self.total_api_calls}, Avg API time: {self.total_api_time/self.total_api_calls:.2f}s")
+            return combined
+
+        except Exception as e:
+            logger.error(f"Error in format_documentation: {str(e)}")
+            return f"Error formatting documentation: {str(e)}"
+
+    async def final_review(self, content: str) -> str:
+        """Perform a final review of the entire documentation."""
+        try:
+            logger.info("Starting final documentation review")
+            start_time = perf_counter()
+
+            # Split into larger chunks for final review since we're just cleaning up
+            original_chunk_size = self.chunk_size
+            self.chunk_size = 4000  # Temporarily increase chunk size
+            chunks = self._split_into_chunks(content)
+            self.chunk_size = original_chunk_size
+
+            review_tasks = []
+            for chunk in chunks:
+                task = self._call_gemini(
+                    chunk,
+                    system_message="""You are an expert technical documentation reviewer. Review and improve this API documentation chunk.
+Focus on:
+1. Removing any duplicate content
+2. Ensuring consistent formatting and style
+3. Making the documentation clear and readable
+4. Proper markdown formatting
+5. Consistent heading hierarchy
+6. Proper section breaks
+7. Complete and accurate endpoint documentation
+8. Consistent use of code blocks and tables
+9. Clear parameter descriptions
+10. Proper grouping of related endpoints
+
+Keep all valid API endpoint information but make it more concise and well-organized."""
+                )
+                review_tasks.append(task)
+
+            # Process review chunks in parallel
+            reviewed_chunks = await asyncio.gather(*review_tasks)
+            reviewed_chunks = [chunk for chunk in reviewed_chunks if chunk]
+
+            # Combine reviewed chunks
+            logger.info("Combining reviewed chunks")
+            combined = '\n\n'.join(reviewed_chunks)
+
+            # Final pass to ensure consistency across the entire document
+            logger.info("Making final consistency pass")
+            final_content = await self._call_gemini(
+                combined,
+                system_message="""You are an expert technical documentation editor. This is the final pass of the API documentation.
+Your task is to ensure the entire document is consistent and well-organized.
+
+Focus on:
+1. Consistent structure throughout the document
+2. Clear and logical organization of endpoints
+3. Proper table of contents
+4. Consistent heading levels
+5. Remove any remaining duplicates
+6. Ensure all cross-references are valid
+7. Consistent formatting of endpoints, parameters, and examples
+8. Group related endpoints together
+9. Add clear section dividers
+10. Ensure all API information is accurate and complete
+
+Maintain all API endpoint information but make it as clear and well-organized as possible."""
+            )
+
+            end_time = perf_counter()
+            total_duration = end_time - start_time
+            logger.info(f"Final review completed - Total time: {total_duration:.2f}s")
+
+            return final_content or combined
+
+        except Exception as e:
+            logger.error(f"Error in final review: {str(e)}")
+            return content  # Return original content if review fails
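+
+if __name__ == "__main__":
+    # Minimal smoke test: a quick sketch, not part of the crawler pipeline.
+    # Assumes GOOGLE_API_KEY is set in the environment; the sample text is made up.
+    async def _demo():
+        helper = GeminiHelper()
+        sample = "# Items API\n\nGET /v1/items returns the list of items."
+        print(await helper.format_documentation(sample))
+
+    asyncio.run(_demo())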
diff --git a/src/main.py b/src/main.py
index ffdfe0e..7a92b36 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,24 +1,41 @@
-from doc_crawler import DocCrawler
-import asyncio
 import argparse
+import asyncio
+import os
+from doc_crawler import DocCrawler
 
-async def main():
-    parser = argparse.ArgumentParser(description='Scrape documentation from a website')
-    parser.add_argument('url', help='The base URL to scrape')
-    parser.add_argument('--output', '-o', default='output_docs', help='Output directory for docs')
-    parser.add_argument('--max-pages', '-m', type=int, default=1000, help='Maximum pages to scrape')
-    parser.add_argument('--concurrent', '-c', type=int, default=1, help='Number of concurrent pages to scrape')
+def main():
+    parser = argparse.ArgumentParser(description='Crawl and process documentation using AI')
+    parser.add_argument('url', help='The base URL to start crawling from')
+    parser.add_argument('--output', default='docs', help='Output directory for processed documentation')
+    parser.add_argument('--concurrent', type=int, default=3, help='Number of concurrent pages to scrape')
+    parser.add_argument('--model', choices=['gpt', 'gemini'], default='gpt', help='AI model to use for processing')
+    parser.add_argument('--browser', choices=['chrome', 'firefox'], default='chrome', help='Browser to use for scraping')
+    parser.add_argument('--custom-prompt', help='Path to a text file containing a custom prompt for the AI model')
+    parser.add_argument('--max-pages', type=int, default=1000, help='Maximum number of pages to crawl')
     args = parser.parse_args()
-
+
+    # Read custom prompt from file if specified
+    custom_prompt = None
+    if args.custom_prompt:
+        try:
+            with open(args.custom_prompt, 'r', encoding='utf-8') as f:
+                custom_prompt = f.read().strip()
+        except Exception as e:
+            print(f"Error reading custom prompt file: {str(e)}")
+            return
+
     crawler = DocCrawler(
         base_url=args.url,
         output_dir=args.output,
-        max_pages=args.max_pages,
-        max_concurrent_pages=args.concurrent
+        concurrent_pages=args.concurrent,
+        model_type=args.model,
+        browser_type=args.browser,
+        custom_prompt=custom_prompt,
+        max_pages=args.max_pages
     )
+
+    asyncio.run(crawler.crawl())
 
-    await crawler.crawl()
-
-if __name__ == "__main__":
-    asyncio.run(main())
\ No newline at end of file
+if __name__ == '__main__':
+    main()
\ No newline at end of file

From b24427982cab8eb233007392e41ac69ce601e04d Mon Sep 17 00:00:00 2001
From: Fazeel Nadeem
Date: Wed, 19 Mar 2025 01:11:33 +0500
Subject: [PATCH 2/5] Updated README and requirements

---
 .env.example     |  4 ----
 README.md        | 37 +++++++++++++++++++++++++++++--------
 requirements.txt |  4 +++-
 3 files changed, 32 insertions(+), 13 deletions(-)
 delete mode 100644 .env.example

diff --git a/.env.example b/.env.example
deleted file mode 100644
index cca9614..0000000
--- a/.env.example
+++ /dev/null
@@ -1,4 +0,0 @@
-# .env.example
-
-# Required for scraping
-OPENAI_API_KEY=YOUR_OPENAI_API_KEY
\ No newline at end of file
diff --git a/README.md b/README.md
index 293b69a..b46d0d3 100644
--- a/README.md
+++ b/README.md
@@ -22,12 +22,19 @@ The `-e` flag installs the package in "editable" mode, which means:
 
 ### Environment Setup
 
-Create a `.env` file in the project root:
+Create a `.env` file in the project root with your API key(s):
+
+For OpenAI GPT:
+```bash
+OPENAI_API_KEY=your_openai_api_key_here
+```
+
+For Google Gemini:
 ```bash
-OPENAI_API_KEY=your_api_key_here
+GOOGLE_API_KEY=your_google_api_key_here
 ```
 
-⚠️ The OpenAI API key is required for the crawler to process documentation.
+⚠️ At least one API key is required for the crawler to process documentation.
 
 ## Usage
@@ -43,10 +50,19 @@ python main.py https://docs.example.com
-- `-o, --output`: Output directory (default: output_docs)
-- `-m, --max-pages`: Maximum pages to scrape (default: 1000)
-- `-c, --concurrent`: Number of concurrent pages to scrape (default: 1)
+- `--output`: Output directory for processed documentation (default: docs)
+- `--max-pages`: Maximum number of pages to crawl (default: 1000)
+- `--concurrent`: Number of concurrent pages to scrape (default: 3)
+- `--model`: AI model to use for processing (default: "gpt", options: "gpt" or "gemini")
+- `--browser`: Browser to use for scraping (default: "chrome", options: "chrome" or "firefox")
+- `--custom-prompt`: Path to a text file containing a custom prompt for the AI model
 
 Example with all options:
 ```bash
-python main.py https://docs.example.com -o my_docs -m 500 -c 2
+python main.py https://docs.example.com --output my_docs --max-pages 500 --concurrent 2 --model gemini
 ```
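+
+For example, to format a Stoplight-hosted REST API reference with Gemini using the bundled prompt (a suggested invocation; adjust the path if `custom_prompt.txt` lives elsewhere in your checkout):
+
+```bash
+python main.py https://docs.example.com --model gemini --custom-prompt src/custom_prompt.txt
+```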
 
 ### Troubleshooting
@@ -63,8 +71,13 @@ The crawler accepts the following parameters:
 - `output_dir`: Directory where scraped docs will be saved
 - `max_pages`: Maximum number of pages to crawl
-- `max_concurrent_pages`: Number of concurrent pages to process
+- `concurrent_pages`: Number of concurrent pages to process
+- `model_type`: AI model to use ("gpt" or "gemini")
+- `browser_type`: Browser to use for scraping ("chrome" or "firefox")
+- `custom_prompt`: Optional custom prompt text for the AI model
 
 ## Requirements
 
 - Python 3.8+
 - Chrome/Chromium browser (for Selenium)
+- OpenAI API key (for GPT model)
+- Google API key (for Gemini model)
diff --git a/requirements.txt b/requirements.txt
index 704f679..839cf8d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,6 @@ python-dotenv>=0.19.0
 pyyaml==6.0.1
 tqdm>=4.65.0
 outlines>=0.0.1
-chromadb>=0.4.18
\ No newline at end of file
+chromadb>=0.4.18
+google-generativeai>=0.3.0
+geckodriver-autoinstaller>=0.1.0
\ No newline at end of file

From 854588d30bbd0d5fd22d47351cb2c7deefcd2918 Mon Sep 17 00:00:00 2001
From: Fazeel Nadeem
Date: Wed, 19 Mar 2025 01:13:51 +0500
Subject: [PATCH 3/5] Added google-genai to requirements

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 839cf8d..7ba0f27 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,5 +8,5 @@ pyyaml==6.0.1
 tqdm>=4.65.0
 outlines>=0.0.1
 chromadb>=0.4.18
-google-generativeai>=0.3.0
+google-genai>=1.5.0
 geckodriver-autoinstaller>=0.1.0
\ No newline at end of file

From 454df320fc79f2772210fc225666dc0d8be81a66 Mon Sep 17 00:00:00 2001
From: Fazeel Nadeem
Date: Wed, 19 Mar 2025 01:24:17 +0500
Subject: [PATCH 4/5] Added back example .env

---
 .env.example | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 .env.example

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..7215072
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,4 @@
+# .env.example
+
+# Required for scraping
+GOOGLE_API_KEY="Your Google API Key"
\ No newline at end of file

From f8140c5930f50c24b9b1c1fac5fbecb578022d88 Mon Sep 17 00:00:00 2001
From: Fazeel Nadeem
Date: Wed, 19 Mar 2025 01:32:41 +0500
Subject: [PATCH 5/5] Removed wrong requirement

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 7ba0f27..54dff39 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,3 @@ tqdm>=4.65.0
 outlines>=0.0.1
 chromadb>=0.4.18
 google-genai>=1.5.0
-geckodriver-autoinstaller>=0.1.0
\ No newline at end of file