"""Crumbforest cleanup and security scan.

Runs two independent checks and prints a report to stdout:

1. Orphan check: markdown files under ``docs`` that have no matching slug
   in the ``posts`` table.
2. Secret scan: source/config files under the current directory whose
   content matches a known credential pattern.
"""
import os
import re
import sys
from pathlib import Path

# Add app to path
sys.path.insert(0, 'app')

from deps import get_db
from pymysql.cursors import DictCursor

# Credential patterns, compiled once. The private-key pattern is written
# with quantifiers (rather than a literal run of dashes) so that this file
# does not match its own source when it scans itself, and it also accepts
# typed PEM headers (RSA, EC, OPENSSH, ENCRYPTED, ...).
_SECRET_PATTERNS = {
    "OPENAI_API_KEY": re.compile(r"sk-[a-zA-Z0-9]{20,}"),
    "Generic API Key": re.compile(
        r"(?i)api_key\s*=\s*['\"][a-zA-Z0-9-]{20,}['\"]"
    ),
    "Password": re.compile(r"(?i)password\s*=\s*['\"][^'\"]{8,}['\"]"),
    "Private Key": re.compile(r"-{5}BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY-{5}"),
}

# Directories never descended into during the secret scan.
_EXCLUDED_DIRS = {".git", "__pycache__", "venv", "node_modules", ".gemini"}

# Only files with these suffixes are read by the secret scan.
_SCANNED_SUFFIXES = (".py", ".env", ".json", ".md", ".sh")


def scan_markdown_files(root_dir="docs"):
    """Find all .md files below *root_dir*.

    Args:
        root_dir: Directory to walk recursively.

    Returns:
        A list of file paths relative to *root_dir*, in ``os.walk`` order.
        The list is empty when *root_dir* does not exist.
    """
    md_files = []
    for root, _, files in os.walk(root_dir):
        for file in files:
            if file.endswith(".md"):
                rel_path = os.path.relpath(os.path.join(root, file), root_dir)
                md_files.append(rel_path)
    return md_files


def get_indexed_slugs():
    """Get all slugs from the posts table.

    Returns:
        A set containing the ``slug`` value of every row in ``posts``.

    The connection obtained from ``get_db()`` is always closed, even when
    the query fails.
    """
    conn = get_db()
    try:
        # Request dict rows explicitly: the comprehension below indexes by
        # column name, which fails on the default tuple cursor.
        with conn.cursor(DictCursor) as cur:
            cur.execute("SELECT slug FROM posts")
            return {row['slug'] for row in cur.fetchall()}
    finally:
        conn.close()


def scan_secrets(root_dir="."):
    """Scan files below *root_dir* for potential secrets.

    Only files ending in one of ``_SCANNED_SUFFIXES`` are read, and the
    directories in ``_EXCLUDED_DIRS`` are skipped entirely.

    Args:
        root_dir: Directory to walk recursively.

    Returns:
        A list of human-readable findings, one per (file, pattern) match.
        The matched text itself is never included.
    """
    findings = []
    for root, dirs, files in os.walk(root_dir):
        # Prune in place so os.walk does not descend into excluded dirs.
        dirs[:] = [d for d in dirs if d not in _EXCLUDED_DIRS]
        for file in files:
            if not file.endswith(_SCANNED_SUFFIXES):
                continue
            path = os.path.join(root, file)
            try:
                # errors='ignore' drops undecodable bytes, so only I/O
                # failures can be raised here.
                with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
            except OSError as exc:
                # Best effort: report the unreadable file and keep scanning.
                print(f"Skipping unreadable file {path}: {exc}", file=sys.stderr)
                continue
            for name, pattern in _SECRET_PATTERNS.items():
                if pattern.search(content):
                    # Don't print the secret, just the finding
                    findings.append(f"⚠️ Potential {name} found in {path}")
    return findings


def _find_orphans(md_files, indexed_slugs):
    """Return the entries of *md_files* that match no slug in *indexed_slugs*.

    A file counts as indexed when its relative path without extension is a
    slug, or, because the indexer may store slugs under a different prefix,
    when any slug ends with the file's base name.
    """
    orphans = []
    for md in md_files:
        # Slugs use '/' separators; relative paths use os.sep.
        slug_candidate = os.path.splitext(md)[0].replace(os.sep, "/")
        if slug_candidate in indexed_slugs:
            continue
        basename = slug_candidate.rsplit("/", 1)[-1]
        # NOTE: deliberately loose suffix match; it can hide an orphan whose
        # base name happens to be the tail of an unrelated slug.
        if not any(slug.endswith(basename) for slug in indexed_slugs):
            orphans.append(md)
    return orphans


def main():
    """Run the orphan check and the secret scan and print the report."""
    print("🧹 Crumbforest Cleanup & Security Scan")
    print("=" * 60)

    # 1. Orphan Check
    print("\nPROBING: Unindexed .md Files...")
    orphans = _find_orphans(scan_markdown_files(), get_indexed_slugs())

    if orphans:
        print(f"found {len(orphans)} potentially unindexed files:")
        for o in orphans:
            print(f" - {o}")
    else:
        print("✅ No unindexed markdown files found (all seem covered by DB).")

    # 2. Secret Scan
    print("\nPROBING: Secrets & Keys...")
    secrets = scan_secrets()
    if secrets:
        for s in secrets:
            print(s)
    else:
        print("✅ No obvious secrets found in code.")

    print("\n" + "="*60)
    print("Scan Complete.")


if __name__ == "__main__":
    main()