Files
Crumb-Core-v.1/app/cleanup_scan.py

113 lines
3.7 KiB
Python

import os
import re
import sys
from pathlib import Path
# Add app to path
sys.path.insert(0, 'app')
from deps import get_db
from pymysql.cursors import DictCursor
def scan_markdown_files(root_dir="docs"):
    """Collect every Markdown file beneath ``root_dir``.

    The tree is traversed with ``os.walk``. Each file whose name ends in
    ``.md`` is returned as a path relative to ``root_dir``. A missing
    ``root_dir`` simply yields an empty list.
    """
    return [
        os.path.relpath(os.path.join(folder, name), root_dir)
        for folder, _subdirs, names in os.walk(root_dir)
        for name in names
        if name.endswith(".md")
    ]
def get_indexed_slugs():
    """Return the set of all ``slug`` values stored in the ``posts`` table.

    A connection is obtained from ``get_db()``, ``SELECT slug FROM posts`` is
    executed, and the connection is closed again even if the query fails.

    The cursor is requested explicitly as a ``DictCursor`` because the rows
    are read by column name; with a tuple-returning default cursor
    ``row["slug"]`` would raise ``TypeError``.
    NOTE(review): assumes ``get_db()`` returns a PyMySQL connection whose
    ``cursor()`` accepts a cursor class -- confirm against ``deps.get_db``.
    """
    conn = get_db()
    try:
        with conn.cursor(DictCursor) as cur:
            cur.execute("SELECT slug FROM posts")
            return {row["slug"] for row in cur.fetchall()}
    finally:
        conn.close()
def scan_secrets(root_dir="."):
    """Scan text-like files under ``root_dir`` for strings that look like secrets.

    Only files ending in ``.py``, ``.env``, ``.json``, ``.md`` or ``.sh`` are
    read, and the directories ``.git``, ``__pycache__``, ``venv``,
    ``node_modules`` and ``.gemini`` are never entered.

    Returns:
        A list of human-readable warning strings, at most one per
        (file, pattern) pair. The matched text itself is never included, so
        the report does not leak the secret. Findings are ordered by sorted
        traversal, which keeps the report stable between runs.

    Files that cannot be opened or read are skipped with a note on stderr
    rather than silently ignored, so a clean report is not mistaken for full
    coverage.
    """
    # Compiled once up front instead of being re-resolved for every file.
    patterns = {
        "OPENAI_API_KEY": re.compile(r"sk-[a-zA-Z0-9]{20,}"),
        "Generic API Key": re.compile(
            r"(?i)api_key\s*=\s*['\"][a-zA-Z0-9-]{20,}['\"]"
        ),
        "Password": re.compile(r"(?i)password\s*=\s*['\"][^'\"]{8,}['\"]"),
        # Accept the untyped PEM header as well as typed ones (RSA, EC, DSA,
        # OPENSSH, ENCRYPTED, ...), which the untyped-only pattern missed.
        "Private Key": re.compile(r"-----BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY-----"),
    }
    exclude_dirs = {".git", "__pycache__", "venv", "node_modules", ".gemini"}
    scanned_suffixes = (".py", ".env", ".json", ".md", ".sh")

    findings = []
    for root, dirs, files in os.walk(root_dir):
        # Prune in place so os.walk never descends into excluded directories;
        # sorting makes the traversal order independent of the filesystem.
        dirs[:] = sorted(d for d in dirs if d not in exclude_dirs)
        for file in sorted(files):
            if not file.endswith(scanned_suffixes):
                continue
            path = os.path.join(root, file)
            try:
                # errors='ignore' drops undecodable bytes, so binary-ish files
                # cannot abort the scan with a decode error.
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    content = f.read()
            except OSError as exc:
                # Best effort: keep scanning, but make the gap visible.
                print(f"Skipping unreadable file {path}: {exc}", file=sys.stderr)
                continue
            for name, pattern in patterns.items():
                if pattern.search(content):
                    # Don't print the secret, just the finding
                    findings.append(f"⚠️ Potential {name} found in {path}")
    return findings
def _find_orphans(md_files, indexed_slugs):
    """Return the entries of ``md_files`` that no indexed slug accounts for.

    A file counts as indexed when its relative path without extension equals
    a slug, or when its bare file name equals the last ``/``-separated
    segment of some slug (slugs may carry a different prefix than the
    on-disk folder, e.g. ``rz-nullfeld/filename``).
    """
    # Compare whole path segments: a raw suffix test would treat 'a.md' as
    # indexed as soon as any slug merely ends with the letter 'a'.
    slug_leaves = {slug.rsplit("/", 1)[-1] for slug in indexed_slugs}

    orphans = []
    for md in md_files:
        # Slugs use '/', relative paths use the OS separator; normalise first.
        slug_candidate = os.path.splitext(md)[0].replace(os.sep, "/")
        if slug_candidate in indexed_slugs:
            continue
        if slug_candidate.rsplit("/", 1)[-1] in slug_leaves:
            continue
        orphans.append(md)
    return orphans


def main():
    """Run the orphan check and the secret scan, printing a report to stdout.

    Step 1 lists Markdown files found by ``scan_markdown_files()`` that have
    no matching slug from ``get_indexed_slugs()``. Step 2 prints every
    finding returned by ``scan_secrets()``.
    """
    print("🧹 Crumbforest Cleanup & Security Scan")
    print("=" * 60)

    # 1. Orphan Check
    print("\nPROBING: Unindexed .md Files...")
    md_files = scan_markdown_files()
    indexed_slugs = get_indexed_slugs()
    orphans = _find_orphans(md_files, indexed_slugs)
    if orphans:
        print(f"found {len(orphans)} potentially unindexed files:")
        for o in orphans:
            print(f" - {o}")
    else:
        print("✅ No unindexed markdown files found (all seem covered by DB).")

    # 2. Secret Scan
    print("\nPROBING: Secrets & Keys...")
    secrets = scan_secrets()
    if secrets:
        for s in secrets:
            print(s)
    else:
        print("✅ No obvious secrets found in code.")

    print("\n" + "="*60)
    print("Scan Complete.")
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()