How OpenGround extracts documentation from git repositories, sitemaps, and local directories
OpenGround can ingest documentation from three source types: git repositories, sitemaps, and local directories. Each source type has a dedicated extractor that produces standardized ParsedPage objects for downstream processing.
def resolve_remote_ref(repo_url: str, version: str) -> str | None:
    """Check if a ref (tag or branch) exists on the remote.

    Handles 'v' prefix variants for tags, so callers may pass either
    ``v1.0.0`` or ``1.0.0``.

    Args:
        repo_url: URL of the git repository to query.
        version: Ref name (tag or branch) to resolve.

    Returns:
        The exact ref name that exists on the remote, or None if neither
        the given version nor its 'v'-prefix variant was found (or the
        remote could not be queried).
    """
    # Get all remote refs
    result = subprocess.run(
        ["git", "ls-remote", "--refs", repo_url],
        capture_output=True,
        text=True,
    )
    # If the remote is unreachable or the URL is bad, treat as "not found".
    if result.returncode != 0:
        return None

    # Fix: the original snippet never built `remote_refs` from the command
    # output. Each stdout line is "<sha>\trefs/tags/v1.0.0" (or
    # refs/heads/<branch>); keep only the short ref name for matching.
    remote_refs: set[str] = set()
    for line in result.stdout.splitlines():
        _, _, full_ref = line.partition("\t")
        short = full_ref.removeprefix("refs/tags/").removeprefix("refs/heads/")
        if short:
            remote_refs.add(short)

    # Check exact match
    if version in remote_refs:
        return version

    # Check variants (v1.0.0 <-> 1.0.0)
    if version.startswith("v"):
        variants = [version[1:]]  # Try without 'v'
    else:
        variants = [f"v{version}"]  # Try with 'v'
    for variant in variants:
        if variant in remote_refs:
            return variant

    # Explicitly signal "not found" rather than falling off the end.
    return None
You can specify either `v1.0.0` or `1.0.0` — OpenGround will automatically resolve the correct tag on the remote.
The sitemap extractor (extract/sitemap.py) fetches and processes web pages:
1
Fetch Sitemap
Copy
# From extract/sitemap.py:23-46async with session.get(url) as response: content = await response.text()root = ET.fromstring(content)namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}# Extract all <loc> URLsurls = { loc.text for loc in root.findall(".//ns:loc", namespaces=namespace) if loc.text}# Filter by keywords (case-insensitive)if keywords: urls = {u for u in urls if any(k in u.lower() for k in keywords)}
2
Check robots.txt
Copy
# From extract/sitemap.py:50-78robot_parser = await fetch_robots_txt(session, base_url)allowed_urls = { url for url in urls if robot_parser.can_fetch("*", url)}
OpenGround respects robots.txt and only crawls allowed URLs.
3
Process Pages Concurrently
Copy
# From extract/sitemap.py:207-221semaphore = asyncio.Semaphore(concurrency_limit) # Default: 50tasks = [ process_url(semaphore, session, url, library_name, version) for url in urls]results = await asyncio.gather(*tasks)
Downloads and processes up to 50 pages concurrently for speed.
Trafilatura extracts clean content, removing navigation, ads, etc.
Some sites use client-side rendering (React, Vue, Next.js) which requires JavaScript. OpenGround will detect this and skip those pages. Use the git source type instead for such documentation.
From extract/sitemap.py:141-156, OpenGround warns about JS-required pages:
Copy
# When extraction yielded no content, scan the raw HTML for markers that
# commonly appear in client-side-rendered apps (Next.js, React, Vue roots)
# and warn that the page likely needs JavaScript to render.
if not content:
    js_indicators = [
        "BAILOUT_TO_CLIENT_SIDE_RENDERING",
        "_next/static",
        'id="root"',
        'id="app"',
        'id="__next"',
        "You need to enable JavaScript",
    ]
    if any(indicator in html for indicator in js_indicators):
        print(f"Warning: Page likely requires JavaScript: {url}")
The local path extractor (extract/local_path.py) is the simplest:
Copy
# From extract/local_path.py:18-70async def extract_local_path( local_path: Path, output_dir: Path, library_name: str, version: str,) -> None: # Expand ~ and resolve to absolute path local_path = local_path.expanduser().resolve() # Validate path exists and is a directory if not local_path.exists(): error(f"Path does not exist: {local_path}") return # Find all documentation files doc_files = filter_documentation_files(local_path) # Process files and save results = await process_documentation_files( doc_files=doc_files, url_generator=lambda p: f"file://{p}", library_name=library_name, version=version, default_description=f"Documentation file from {local_path}", base_path=local_path, ) await save_results(results, output_dir)
Local paths use file:// URLs for references. Perfect for work-in-progress documentation or private codebases.
From extract/source.py:58-89, OpenGround automatically saves sources:
Copy
def save_source_to_sources(library_name: str, config: LibrarySource) -> None: """Save to both project-local and user sources files.""" # Save to .openground/sources.json (project-local) _save_to_file(PROJECT_SOURCE_FILE) # Save to ~/.openground/sources.json (user) _save_to_file(USER_SOURCE_FILE)
# Title from front matter or filenametitle = metadata.get("title") or \ file_path.stem.replace("-", " ").title()# Description from metadata or pathdescription = metadata.get("description") or \ f"Documentation file from {relative_path}"
Extracted pages are saved as JSON files before embedding (from extract/common.py:230-258):
Copy
async def save_results(results: list[ParsedPage], output_dir: Path):
    """Write every parsed page to `output_dir` as a pretty-printed JSON file.

    The directory is emptied first so stale pages from a previous run do
    not linger; each page's file name is a slug derived from its URL path.
    """
    # Remove any leftover output from an earlier extraction.
    if output_dir.exists():
        for entry in output_dir.iterdir():
            if entry.is_file():
                entry.unlink()
            else:
                shutil.rmtree(entry)
    output_dir.mkdir(parents=True, exist_ok=True)

    for page in results:
        # Turn "/a/b/c" into "a-b-c"; an empty path (site root) becomes "home".
        path_part = urlparse(page["url"]).path.strip("/")
        slug = path_part.replace("/", "-") or "home"
        target = output_dir / f"{slug}.json"
        with open(target, "w", encoding="utf-8") as fh:
            json.dump(page, fh, indent=2)