# Source listing metadata: 113 lines, 3.7 KiB, Python
import os
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add app to path
|
|
sys.path.insert(0, 'app')
|
|
from deps import get_db
|
|
from pymysql.cursors import DictCursor
|
|
|
|
def scan_markdown_files(root_dir="docs"):
    """Collect every Markdown file below *root_dir*.

    Walks the directory tree with ``os.walk`` and returns a list of paths,
    each relative to *root_dir*, for all files whose name ends in ``.md``.
    Entries appear in the order the walk yields them.
    """
    collected = []
    for folder, _subdirs, names in os.walk(root_dir):
        markdown_names = (name for name in names if name.endswith(".md"))
        collected.extend(
            os.path.relpath(os.path.join(folder, name), root_dir)
            for name in markdown_names
        )
    return collected
|
|
|
|
def get_indexed_slugs():
    """Return the set of ``slug`` values stored in the ``posts`table.

    Opens a connection via ``get_db()``, runs ``SELECT slug FROM posts`` and
    always closes the connection afterwards, even if the query fails.

    NOTE(review): rows are indexed by the ``'slug'`` key, so this assumes the
    connection hands out dict-style rows -- confirm against ``get_db``.
    """
    connection = get_db()
    try:
        with connection.cursor() as cursor:
            cursor.execute("SELECT slug FROM posts")
            records = cursor.fetchall()
            return {record['slug'] for record in records}
    finally:
        connection.close()
|
|
|
|
def scan_secrets(root_dir="."):
    """Scan text-like files under *root_dir* for strings that look like secrets.

    Only files ending in ``.py``, ``.env``, ``.json``, ``.md`` or ``.sh`` are
    read, and the directories in ``exclude_dirs`` are never descended into.
    Each file is checked against every pattern; a match adds one message per
    pattern name. The matched text itself is never included in the message.

    Args:
        root_dir: Directory to walk. Defaults to the current directory.

    Returns:
        A list of human-readable finding strings, one per (pattern, file)
        match, in walk order. Empty when nothing matches.
    """
    # Compiled once up front instead of being looked up per file.
    patterns = {
        "OPENAI_API_KEY": re.compile(r"sk-[a-zA-Z0-9]{20,}"),
        "Generic API Key": re.compile(
            r"(?i)api_key\s*=\s*['\"][a-zA-Z0-9-]{20,}['\"]"
        ),
        "Password": re.compile(r"(?i)password\s*=\s*['\"][^'\"]{8,}['\"]"),
        # Accept typed PEM headers too ("RSA PRIVATE KEY", "EC PRIVATE KEY",
        # "OPENSSH PRIVATE KEY", "ENCRYPTED PRIVATE KEY"), not only the bare
        # "PRIVATE KEY" label.
        "Private Key": re.compile(r"-----BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY-----"),
    }
    exclude_dirs = {".git", "__pycache__", "venv", "node_modules", ".gemini"}
    scanned_suffixes = (".py", ".env", ".json", ".md", ".sh")

    findings = []
    for root, dirs, files in os.walk(root_dir):
        # Prune in place so os.walk skips the excluded directories entirely.
        dirs[:] = [d for d in dirs if d not in exclude_dirs]

        for file in files:
            if not file.endswith(scanned_suffixes):
                continue

            path = os.path.join(root, file)
            try:
                # errors='ignore' drops undecodable bytes, so binary-ish
                # content cannot raise a decode error here.
                with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
            except OSError:
                # Best-effort scan: files that cannot be opened or read
                # (permissions, broken symlinks, ...) are skipped.
                continue

            for name, pattern in patterns.items():
                if pattern.search(content):
                    # Don't print the secret, just the finding
                    findings.append(f"⚠️ Potential {name} found in {path}")
    return findings
|
|
|
|
def main():
    """Run the orphan check and the secret scan, printing a report to stdout.

    Step 1 compares the Markdown files from ``scan_markdown_files()`` with the
    slugs from ``get_indexed_slugs()`` and lists files that match no slug.
    Step 2 prints every finding returned by ``scan_secrets()``.
    """
    divider = "=" * 60
    print("🧹 Crumbforest Cleanup & Security Scan")
    print(divider)

    # 1. Orphan Check
    print("\nPROBING: Unindexed .md Files...")
    markdown_paths = scan_markdown_files()
    known_slugs = get_indexed_slugs()

    unindexed = []
    for md_path in markdown_paths:
        # First guess: the slug is the relative path without its extension,
        # e.g. 'crumbforest/intro.md' -> 'crumbforest/intro'.
        candidate = os.path.splitext(md_path)[0]
        if candidate in known_slugs:
            continue

        # Fallback: the indexer may use a different prefix, so accept any
        # slug that ends with the bare file name.
        # NOTE(review): this is a plain suffix test with no separator
        # boundary, so 'intro' also matches a slug ending in 'reintro' --
        # confirm that this looseness is acceptable.
        stem = os.path.basename(candidate)
        if any(slug.endswith(stem) for slug in known_slugs):
            continue

        unindexed.append(md_path)

    if not unindexed:
        print("✅ No unindexed markdown files found (all seem covered by DB).")
    else:
        print(f"found {len(unindexed)} potentially unindexed files:")
        for entry in unindexed:
            print(f" - {entry}")

    # 2. Secret Scan
    print("\nPROBING: Secrets & Keys...")
    findings = scan_secrets()
    if not findings:
        print("✅ No obvious secrets found in code.")
    else:
        for finding in findings:
            print(finding)

    print("\n" + divider)
    print("Scan Complete.")
|
|
|
|
# Script entry point: run the scan only when executed directly, not on import.
if __name__ == "__main__":
    main()
|