This commit is contained in:
cash
2026-03-29 23:50:49 -05:00
commit eb5e194331
56 changed files with 4010 additions and 0 deletions

320
backend/web/db_extra.py Normal file
View File

@@ -0,0 +1,320 @@
"""
backend/web/db_extra.py - 16 May 2025
"""
from __future__ import annotations
import asyncio
import datetime as dt
from datetime import timezone
import structlog
from typing import List
from sqlalchemy import (
Table, Column, Text, Float, Integer, Boolean, DateTime, func,
select, insert, update, delete, or_, inspect, text
)
from sqlalchemy.dialects.postgresql import insert as pg_insert
from backend.core.db import SessionLocal, metadata, engine
from backend.core.settings import (
MAX_IN_USE, FAIL_COOLDOWN_SEC, MIN_SCORE, PROXY_LIST_FILE,
_WINDOW_MINUTES, PROXY_USERNAME, PROXY_PASSWORD,
_MAX_LOGIN_FAILS, _MAX_INVALID_URLS
)
log = structlog.get_logger()
# True when the configured engine targets PostgreSQL; used throughout to
# pick dialect-specific SQL (ON CONFLICT vs OR IGNORE, greatest() vs max()).
_IS_PG = engine.url.get_backend_name().startswith("postgres")
def _insert_ignore(tbl: Table, **vals):
    """Build an INSERT that silently skips rows hitting a unique constraint.

    PostgreSQL gets ON CONFLICT DO NOTHING; other backends (SQLite) get
    INSERT OR IGNORE.
    """
    if not _IS_PG:
        return insert(tbl).prefix_with("OR IGNORE").values(**vals)
    return pg_insert(tbl).values(**vals).on_conflict_do_nothing()
def _clamp_zero(expr):
    """Portable SQL expression flooring *expr* at zero (max(expr, 0)).

    PostgreSQL spells the scalar maximum GREATEST(); SQLite's max() is a
    scalar function when given two arguments.
    """
    if _IS_PG:
        return func.greatest(expr, 0)
    return func.max(expr, 0)
# Pool of outbound proxies with health bookkeeping used by the
# acquire/release/flush functions below.
proxy_tbl = Table(
    "proxies", metadata,
    Column("url", Text, primary_key=True),  # full proxy URL (may embed credentials)
    Column("score", Float, nullable=False, server_default="1.0"),  # health score; must stay above MIN_SCORE to be picked
    Column("fails", Integer, nullable=False, server_default="0"),  # running failure tally (floored at 0 by the flusher)
    Column("banned", Boolean, nullable=False, server_default="false"),  # set by the flusher once failures exceed its threshold
    Column("in_use", Integer, nullable=False, server_default="0"),  # concurrent lease count, capped by MAX_IN_USE
    Column("last_fail", DateTime),  # timestamp of last failure; drives the FAIL_COOLDOWN_SEC window
    Column("updated_at", DateTime, server_default=func.now(), index=True),
)
# Per-IP failed-login counter backing too_many_attempts()/record_login().
login_tbl = Table(
    "login_attempts", metadata,
    Column("ip", Text, primary_key=True),
    Column("count", Integer, nullable=False, server_default="0"),  # failures since last success (reset to 0 on success)
    Column("updated_at", DateTime, nullable=False, server_default=func.now()),
)
# Per-IP invalid-URL submission counter backing invalid_over_limit().
invalid_tbl = Table(
    "invalid_urls", metadata,
    Column("ip", Text, primary_key=True),
    Column("count", Integer, nullable=False, server_default="0"),  # submissions within the rolling window
    Column("updated_at", DateTime, nullable=False, server_default=func.now()),
)
# Rolling log of download outcomes; add_dl_stat() trims it to the newest
# 500 rows, recent_success_rate() reads the tail.
dl_stats = Table(
    "dl_stats", metadata,
    Column("id", Integer, primary_key=True, autoincrement=True),
    Column("ok", Boolean, nullable=False),  # True when the download succeeded
    Column("ts", DateTime, nullable=False, server_default=func.now(), index=True),
)
def _ensure_proxy_columns() -> None:
    """Ad-hoc migration: add in_use / last_fail to an existing proxies table.

    Inspects the live schema and issues ALTER TABLE only for columns that
    are missing. A no-op when the table does not exist yet or already has
    both columns.
    """
    inspector = inspect(engine)
    if "proxies" not in inspector.get_table_names():
        return
    present = {col["name"] for col in inspector.get_columns("proxies")}
    wanted = [("in_use", "INTEGER DEFAULT 0"), ("last_fail", "TIMESTAMP")]
    missing = [(name, ddl) for name, ddl in wanted if name not in present]
    if not missing:
        return
    with engine.begin() as conn:
        for name, ddl in missing:
            # Postgres supports IF NOT EXISTS on ADD COLUMN; SQLite does not.
            guard = "IF NOT EXISTS " if _IS_PG else ""
            conn.execute(text(f"ALTER TABLE proxies ADD COLUMN {guard}{name} {ddl};"))
    log.info("proxy.schema.auto_migrated", added=[name for name, _ in missing])
#metadata.create_all(engine)
# Run the ad-hoc column migration at import time so the columns exist
# before any query below touches them.
_ensure_proxy_columns()
def _seed() -> None:
    """Load proxies from PROXY_LIST_FILE into the pool (idempotent).

    Each non-empty line is expected to be "ip:port". Credentials from
    settings are embedded in the URL when PROXY_USERNAME is set. Existing
    rows are left untouched via INSERT ... IGNORE semantics.

    FIX: a line without ":" previously raised ValueError from the
    two-target unpack of split(":", 1), aborting the whole seeding
    transaction; malformed lines are now logged and skipped.
    """
    if not PROXY_LIST_FILE.exists():
        return
    with SessionLocal.begin() as s:
        for ln in PROXY_LIST_FILE.read_text().splitlines():
            ln = ln.strip()
            if not ln:
                continue
            ip, sep, port = ln.partition(":")
            if not sep or not ip or not port:
                log.warning("proxy.seed.malformed_line", line=ln)
                continue
            px = (
                f"http://{PROXY_USERNAME}:{PROXY_PASSWORD}@{ip}:{port}"
                if PROXY_USERNAME else f"http://{ip}:{port}"
            )
            s.execute(_insert_ignore(proxy_tbl, url=px))
def _candidate_stmt(now: dt.datetime):
    """Build the SELECT ... FOR UPDATE choosing the best available proxy.

    A proxy qualifies when it is not banned, scores above MIN_SCORE, has
    lease capacity left, and is past its failure cooldown. A small random
    jitter on the ordering breaks ties so the single top-scored proxy is
    not picked exclusively.
    """
    cooldown_cutoff = now - dt.timedelta(seconds=FAIL_COOLDOWN_SEC)
    tie_breaker = func.random() * 0.01
    cooled_down = or_(
        proxy_tbl.c.last_fail.is_(None),
        proxy_tbl.c.last_fail < cooldown_cutoff,
    )
    stmt = select(proxy_tbl.c.url).where(
        proxy_tbl.c.banned.is_(False),
        proxy_tbl.c.score > MIN_SCORE,
        proxy_tbl.c.in_use < MAX_IN_USE,
        cooled_down,
    )
    stmt = stmt.order_by((proxy_tbl.c.score + tie_breaker).desc())
    return stmt.limit(1).with_for_update(nowait=False)
def acquire_proxy() -> str | None:
    """Lease the best available proxy, or None when none qualify.

    The candidate row is locked FOR UPDATE and its in_use counter bumped
    within the same transaction, so concurrent callers cannot over-lease.
    """
    now = dt.datetime.now(timezone.utc)
    with SessionLocal.begin() as s:
        candidate = s.execute(_candidate_stmt(now)).first()
        if candidate is None:
            return None
        url = candidate[0]
        bump = (
            update(proxy_tbl)
            .where(proxy_tbl.c.url == url)
            .values(in_use=proxy_tbl.c.in_use + 1, updated_at=now)
        )
        s.execute(bump)
        return url
def release_proxy(px: str, ok: bool) -> None:
    """Return a leased proxy to the pool.

    Decrements in_use (floored at zero); a failed use stamps last_fail so
    the cooldown window applies, while a successful one clears it. Empty
    values and the "DIRECT" pseudo-proxy are ignored.
    """
    if not px or px == "DIRECT":
        return
    now = dt.datetime.now(timezone.utc)
    fail_stamp = None if ok else now
    release_stmt = (
        update(proxy_tbl)
        .where(proxy_tbl.c.url == px)
        .values(
            in_use=_clamp_zero(proxy_tbl.c.in_use - 1),
            updated_at=now,
            last_fail=fail_stamp,
        )
    )
    with SessionLocal.begin() as s:
        s.execute(release_stmt)
# Bounded buffer of (proxy_url, success) results awaiting the batch flusher.
_buffer: asyncio.Queue[tuple[str, bool]] = asyncio.Queue(maxsize=2048)
def queue_proxy_result(px: str, ok: bool) -> None:
    """Record a proxy result without blocking the caller.

    When the buffer is full the oldest entry is dropped to make room:
    losing one stale sample is preferable to stalling the hot path.

    FIX: the retry used a blanket ``except Exception: pass`` which would
    also hide unrelated bugs; it now catches only the queue's own
    QueueEmpty/QueueFull, keeping the deliberate best-effort semantics.
    """
    try:
        _buffer.put_nowait((px, ok))
    except asyncio.QueueFull:
        try:
            _buffer.get_nowait()  # drop the oldest sample
            _buffer.put_nowait((px, ok))
        except (asyncio.QueueEmpty, asyncio.QueueFull):
            pass  # raced with another producer; best effort only
async def _flusher() -> None:
    """Background task: drain queued proxy results and batch-apply them.

    Every 0.4 s the buffer is drained into per-proxy (success, failure)
    tallies, then a single transaction updates score/fails/banned for
    each touched proxy. Batching keeps write pressure on the DB constant
    regardless of how fast results arrive.
    """
    while True:
        await asyncio.sleep(0.4)
        if _buffer.empty():
            continue
        # Aggregate drained results: url -> (successes, failures).
        batch: dict[str, tuple[int, int]] = {}
        while not _buffer.empty():
            px, ok = _buffer.get_nowait()
            succ, fail = batch.get(px, (0, 0))
            if ok:
                succ += 1
            else:
                fail += 1
            batch[px] = (succ, fail)
        now = dt.datetime.now(timezone.utc)
        with SessionLocal.begin() as s:
            for px, (succ, fail) in batch.items():
                # Successes nudge the score up, failures push it down twice
                # as hard; both score and fails are floored at zero SQL-side.
                delta = 0.1 * succ - 0.2 * fail
                stmt = (
                    update(proxy_tbl)
                    .where(proxy_tbl.c.url == px)
                    .values(
                        score=_clamp_zero(proxy_tbl.c.score + delta),
                        fails=_clamp_zero(proxy_tbl.c.fails + fail - succ),
                        # NOTE(review): ban test uses pre-update fails + new
                        # failures only (successes don't offset it, and a
                        # banned flag is never cleared here) — confirm intended.
                        banned=(proxy_tbl.c.fails + fail) > 5,
                        updated_at=now,
                    )
                )
                s.execute(stmt)
def start_background_tasks(loop: asyncio.AbstractEventLoop) -> None:
    """Schedule the flusher loop and proxy-list seeding on *loop*.

    Seeding runs via asyncio.to_thread so file/DB work does not block the
    event loop.
    """
    for coro in (_flusher(), asyncio.to_thread(_seed)):
        loop.create_task(coro)
# Default sample-window size for recent_success_rate().
_WINDOW_N = 50
def add_dl_stat(ok: bool) -> None:
    """Append one download outcome and trim the table to the newest 500 rows."""
    now = dt.datetime.now(timezone.utc)
    with SessionLocal.begin() as s:
        s.execute(insert(dl_stats).values(ok=ok, ts=now))
        # Keep only the 500 highest (= most recent) ids so the table stays
        # bounded; everything outside that window is deleted.
        oldest_keep = select(dl_stats.c.id).order_by(
            dl_stats.c.id.desc()
        ).limit(500)
        s.execute(
            delete(dl_stats).where(~dl_stats.c.id.in_(oldest_keep))
        )
def recent_success_rate(n: int = _WINDOW_N) -> float:
    """Fraction of the last *n* downloads that succeeded (0.5 with no data)."""
    with SessionLocal() as s:
        stmt = (
            select(dl_stats.c.ok)
            .order_by(dl_stats.c.id.desc())
            .limit(n)
        )
        outcomes = s.execute(stmt).scalars().all()
    if not outcomes:
        return 0.5  # neutral prior when no history exists yet
    return sum(outcomes) / len(outcomes)
def _inc(table: Table, ip: str) -> None:
    """Increment the per-IP counter in *table*, creating the row if needed.

    FIX: the original SELECTed the row and wrote back ``row.count + 1`` —
    a read-modify-write race that loses increments under concurrent
    requests. Now the row is seeded with count=0 (a no-op when it already
    exists, via INSERT ... IGNORE) and the increment is computed SQL-side,
    so concurrent increments are not lost.
    """
    now = dt.datetime.now(timezone.utc)
    with SessionLocal.begin() as s:
        s.execute(_insert_ignore(table, ip=ip, count=0, updated_at=now))
        s.execute(
            update(table)
            .where(table.c.ip == ip)
            .values(count=table.c.count + 1, updated_at=now)
        )
def record_login(ip: str, success: bool) -> None:
    """Track login outcomes per IP: failures accumulate, success resets."""
    if not success:
        _inc(login_tbl, ip)
        return
    with SessionLocal.begin() as s:
        s.execute(
            update(login_tbl).where(login_tbl.c.ip == ip).values(count=0)
        )
def inc_invalid(ip: str) -> None:
    """Count one invalid-URL submission from *ip* (feeds invalid_over_limit)."""
    _inc(invalid_tbl, ip)
def _over_limit(table: Table, ip: str, cap: int) -> bool:
    """True when *ip*'s counter in *table* has reached *cap* inside the window.

    Entries older than _WINDOW_MINUTES are treated as expired: their
    counter is reset to zero (in a separate write transaction) and the
    check passes.
    """
    with SessionLocal() as s:
        record = s.execute(
            select(table.c.count, table.c.updated_at).where(table.c.ip == ip)
        ).first()
        if record is None:
            return False
        count, stamp = record
        # Some backends hand back naive timestamps; interpret them as UTC.
        if stamp.tzinfo is None:
            stamp = stamp.replace(tzinfo=timezone.utc)
        age = (dt.datetime.now(timezone.utc) - stamp).total_seconds()
        if age > _WINDOW_MINUTES * 60:
            with SessionLocal.begin() as sx:
                sx.execute(
                    update(table).where(table.c.ip == ip).values(count=0)
                )
            return False
        return count >= cap
def too_many_attempts(ip: str) -> bool:
    """True when *ip* exceeded the allowed login failures within the window."""
    return _over_limit(login_tbl, ip, _MAX_LOGIN_FAILS)
def invalid_over_limit(ip: str) -> bool:
    """True when *ip* exceeded the allowed invalid-URL submissions in the window."""
    return _over_limit(invalid_tbl, ip, _MAX_INVALID_URLS)
def pick_proxy() -> str | None:
    """Backward-compatible alias for acquire_proxy()."""
    return acquire_proxy()
def ensure_proxy(px: str) -> None:
    """Insert proxy URL *px* into the pool if not already present."""
    with SessionLocal.begin() as s:
        s.execute(_insert_ignore(proxy_tbl, url=px))
def update_proxy(px: str, ok: bool) -> None:
    """Backward-compatible alias: enqueue a result for the batch flusher."""
    queue_proxy_result(px, ok)
async def init_proxy_seed() -> None:
    """Seed the proxy table from file without blocking the event loop."""
    await asyncio.to_thread(_seed)