# Crumb-Core-v.1/app/cleanup_scan.py

import os
import re
import sys
from pathlib import Path

# Put the `app` directory at the front of the import path so that the
# `deps` import below can resolve.
# NOTE(review): 'app' is a relative path, so this presumably expects the
# script to be launched from the project root — confirm.
sys.path.insert(0, 'app')
from deps import get_db
from pymysql.cursors import DictCursor

def scan_markdown_files(root_dir="docs"):
    """Collect every Markdown file below *root_dir*.

    Args:
        root_dir: Directory to walk recursively. Defaults to ``"docs"``.

    Returns:
        A list of paths, each relative to *root_dir*, for every file whose
        name ends in ``.md``. The order follows ``os.walk``.
    """
    return [
        os.path.relpath(os.path.join(folder, name), root_dir)
        for folder, _subdirs, names in os.walk(root_dir)
        for name in names
        if name.endswith(".md")
    ]

def get_indexed_slugs():
    """Return the set of all ``slug`` values stored in the ``posts`` table.

    A connection is obtained from ``get_db()`` and is always closed before
    returning, even if the query fails.

    NOTE(review): rows are indexed by column name, so the cursor handed out
    by the connection is presumably dict-style — confirm against ``get_db``.
    """
    connection = get_db()
    try:
        with connection.cursor() as cursor:
            cursor.execute("SELECT slug FROM posts")
            rows = cursor.fetchall()
            return set(row['slug'] for row in rows)
    finally:
        connection.close()

def scan_secrets(root_dir="."):
    """Scan text-like files under *root_dir* for strings that look like secrets.

    The tree is walked recursively, skipping version-control, cache,
    virtualenv and dependency directories. Every file with one of the
    scanned extensions is read as UTF-8 (undecodable bytes are ignored) and
    checked against a small set of regular expressions.

    Args:
        root_dir: Directory to walk. Defaults to the current directory.

    Returns:
        A list of warning strings, one per (pattern, file) hit. Only the
        pattern name and the file path are reported, never the matched text.
    """
    # Compiled once up front instead of once per file.
    patterns = {
        "OPENAI_API_KEY": re.compile(r"sk-[a-zA-Z0-9]{20,}"),
        "Generic API Key": re.compile(r"(?i)api_key\s*=\s*['\"][a-zA-Z0-9-]{20,}['\"]"),
        "Password": re.compile(r"(?i)password\s*=\s*['\"][^'\"]{8,}['\"]"),
        # Accept labelled PEM headers (RSA, EC, OPENSSH, ENCRYPTED, ...) as
        # well as the unlabelled one. Written this way, the pattern text no
        # longer matches itself, so this file is not reported on every run.
        "Private Key": re.compile(r"-----BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY-----"),
    }
    scanned_extensions = (".py", ".env", ".json", ".md", ".sh")
    exclude_dirs = {".git", "__pycache__", "venv", "node_modules", ".gemini"}

    findings = []

    for root, dirs, files in os.walk(root_dir):
        # Prune in place so os.walk does not descend into excluded folders.
        dirs[:] = [d for d in dirs if d not in exclude_dirs]

        for file in files:
            if not file.endswith(scanned_extensions):
                continue

            path = os.path.join(root, file)
            try:
                with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
            except OSError:
                # Best-effort scan: files that cannot be opened or read
                # (permissions, broken symlinks, ...) are skipped.
                continue

            for name, pattern in patterns.items():
                if pattern.search(content):
                    # Don't print the secret, just the finding
                    findings.append(f"⚠️  Potential {name} found in {path}")
    return findings

def _find_orphans(md_files, indexed_slugs):
    """Return the entries of *md_files* that no slug in *indexed_slugs* covers.

    A file counts as indexed when its relative path without extension is a
    slug, or when its base name equals the last "/"-separated segment of
    some slug (slugs may carry a collection prefix the path does not have).
    """
    # Precompute the final segment of every slug once, so each file is a
    # set lookup rather than a scan over all slugs.
    slug_basenames = {slug.rsplit("/", 1)[-1] for slug in indexed_slugs}

    orphans = []
    for md in md_files:
        # 'crumbforest/intro.md' -> 'crumbforest/intro'; OS-specific
        # separators are normalised because slugs use "/".
        slug_candidate = os.path.splitext(md)[0].replace(os.sep, "/")
        if slug_candidate in indexed_slugs:
            continue
        # Compare whole segments only: 'intro' must not be considered
        # covered by an unrelated slug such as 'x/my-intro'.
        if slug_candidate.rsplit("/", 1)[-1] in slug_basenames:
            continue
        orphans.append(md)
    return orphans


def main():
    """Run the orphan check and the secret scan, printing a report to stdout.

    Step 1 lists Markdown files under the default docs directory that do not
    appear to be covered by any slug in the database. Step 2 lists files in
    the current directory tree that contain secret-like strings.
    """
    print("🧹 Crumbforest Cleanup & Security Scan")
    print("=" * 60)

    # 1. Orphan Check
    print("\nPROBING: Unindexed .md Files...")
    md_files = scan_markdown_files()
    indexed_slugs = get_indexed_slugs()

    orphans = _find_orphans(md_files, indexed_slugs)

    if orphans:
        print(f"found {len(orphans)} potentially unindexed files:")
        for o in orphans:
            print(f" - {o}")
    else:
        print("✅ No unindexed markdown files found (all seem covered by DB).")

    # 2. Secret Scan
    print("\nPROBING: Secrets & Keys...")
    secrets = scan_secrets()
    if secrets:
        for s in secrets:
            print(s)
    else:
        print("✅ No obvious secrets found in code.")

    print("\n" + "="*60)
    print("Scan Complete.")

# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()