feat: Fix vector indexing stability, add Gitea linking, enhance admin dashboard
This commit is contained in:
112
app/cleanup_scan.py
Normal file
112
app/cleanup_scan.py
Normal file
@@ -0,0 +1,112 @@
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add app to path
|
||||
sys.path.insert(0, 'app')
|
||||
from deps import get_db
|
||||
from pymysql.cursors import DictCursor
|
||||
|
||||
def scan_markdown_files(root_dir="docs"):
    """Return the relative paths of all ``.md`` files below *root_dir*.

    Args:
        root_dir: Directory to walk recursively. A missing directory yields
            an empty result because ``os.walk`` ignores listing errors by
            default.

    Returns:
        A sorted list of paths relative to *root_dir*, always using ``/`` as
        the separator (e.g. ``"crumbforest/intro.md"``).
    """
    md_files = []
    for root, _, files in os.walk(root_dir):
        for file in files:
            if file.endswith(".md"):
                rel_path = os.path.relpath(os.path.join(root, file), root_dir)
                # Normalise the separator so results look the same on every
                # OS; this is a no-op where os.sep is already "/".
                md_files.append(rel_path.replace(os.sep, "/"))
    # os.walk order depends on the filesystem; sort for reproducible reports.
    return sorted(md_files)
|
||||
|
||||
def get_indexed_slugs():
    """Return the set of all ``slug`` values stored in the ``posts`` table.

    Opens a connection via ``get_db()`` and always closes it again, even if
    the query fails.

    Returns:
        A set containing the ``slug`` column of every row in ``posts``.
    """
    # NOTE(review): assumes get_db() returns a PyMySQL connection - confirm.
    conn = get_db()
    try:
        # Rows are read by column name below, so request dict rows explicitly
        # instead of relying on how the connection happens to be configured.
        with conn.cursor(DictCursor) as cur:
            cur.execute("SELECT slug FROM posts")
            return {row['slug'] for row in cur.fetchall()}
    finally:
        conn.close()
|
||||
|
||||
def scan_secrets(root_dir="."):
    """Scan text files below *root_dir* for strings that look like secrets.

    Only files ending in ``.py``, ``.env``, ``.json``, ``.md`` or ``.sh`` are
    read, and the directories named in ``exclude_dirs`` are never entered.
    Files that cannot be opened or read are skipped.

    Args:
        root_dir: Directory to walk recursively.

    Returns:
        A list of human-readable warnings, one per (pattern, file) hit. The
        matched text itself is never included, only the pattern name and the
        file path.
    """
    patterns = {
        "OPENAI_API_KEY": r"sk-[a-zA-Z0-9]{20,}",
        "Generic API Key": r"(?i)api_key\s*=\s*['\"][a-zA-Z0-9-]{20,}['\"]",
        "Password": r"(?i)password\s*=\s*['\"][^'\"]{8,}['\"]",
        # Accepts an optional key type before "PRIVATE KEY" (RSA, EC,
        # OPENSSH, ...). Unlike a plain literal header, this regex does not
        # match its own source line, so the scanner no longer flags itself.
        "Private Key": r"-----BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY-----",
    }
    # Compile once instead of re-resolving every pattern for every file.
    compiled = {name: re.compile(pattern) for name, pattern in patterns.items()}

    findings = []
    exclude_dirs = {".git", "__pycache__", "venv", "node_modules", ".gemini"}
    extensions = (".py", ".env", ".json", ".md", ".sh")

    for root, dirs, files in os.walk(root_dir):
        # Prune in place so os.walk does not descend into excluded folders;
        # sorting keeps the report order stable between runs.
        dirs[:] = sorted(d for d in dirs if d not in exclude_dirs)

        for file in sorted(files):
            if not file.endswith(extensions):
                continue
            path = os.path.join(root, file)
            try:
                with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
            except OSError:
                # Best effort: unreadable files (permissions, broken
                # symlinks, ...) are skipped rather than aborting the scan.
                continue

            for name, regex in compiled.items():
                if regex.search(content):
                    # Don't print the secret, just the finding
                    findings.append(f"⚠️ Potential {name} found in {path}")
    return findings
|
||||
|
||||
def _find_unindexed(md_files, indexed_slugs):
    """Return the entries of *md_files* that no slug in *indexed_slugs* covers.

    A file counts as covered when its path without extension equals a slug,
    or when its bare file name equals the last ``/``-separated component of
    any slug (the slug may carry a different prefix than the docs folder,
    e.g. ``rz-nullfeld/filename``).
    """
    # Build the lookup once: O(1) per file instead of scanning every slug,
    # and a whole-component match so "intro" is not satisfied by "reintro".
    slug_basenames = {slug.rsplit("/", 1)[-1] for slug in indexed_slugs if slug}

    orphans = []
    for md in md_files:
        # 'crumbforest/intro.md' -> 'crumbforest/intro'
        slug_candidate = os.path.splitext(md)[0].replace(os.sep, "/")
        if slug_candidate in indexed_slugs:
            continue
        if slug_candidate.rsplit("/", 1)[-1] in slug_basenames:
            continue
        orphans.append(md)
    return orphans


def main():
    """Run the orphan check and the secret scan, printing a report to stdout.

    Step 1 lists markdown files found by ``scan_markdown_files()`` that have
    no matching slug from ``get_indexed_slugs()``. Step 2 prints every
    finding returned by ``scan_secrets()``.
    """
    print("🧹 Crumbforest Cleanup & Security Scan")
    print("=" * 60)

    # 1. Orphan Check
    print("\nPROBING: Unindexed .md Files...")
    md_files = scan_markdown_files()
    indexed_slugs = get_indexed_slugs()
    orphans = _find_unindexed(md_files, indexed_slugs)

    if orphans:
        print(f"found {len(orphans)} potentially unindexed files:")
        for o in orphans:
            print(f" - {o}")
    else:
        print("✅ No unindexed markdown files found (all seem covered by DB).")

    # 2. Secret Scan
    print("\nPROBING: Secrets & Keys...")
    secret_findings = scan_secrets()
    if secret_findings:
        for finding in secret_findings:
            print(finding)
    else:
        print("✅ No obvious secrets found in code.")

    print("\n" + "="*60)
    print("Scan Complete.")
|
||||
|
||||
# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user