From 443f9f4f4bc370ba5180d6f645df3ce595940805 Mon Sep 17 00:00:00 2001 From: Dominik Krenn Date: Wed, 5 Nov 2025 08:35:01 +0100 Subject: [PATCH] added better garbage collection --- mathstream/README.md | 7 ++ mathstream/__init__.py | 15 +++- mathstream/engine.py | 9 +- mathstream/exceptions.py | 6 ++ mathstream/number.py | 17 ++++ mathstream/utils.py | 180 +++++++++++++++++++++++++++++++++++++++ test.py | 24 ++++++ 7 files changed, 254 insertions(+), 4 deletions(-) create mode 100644 mathstream/exceptions.py diff --git a/mathstream/README.md b/mathstream/README.md index 6602dd5..f02ad9f 100644 --- a/mathstream/README.md +++ b/mathstream/README.md @@ -23,6 +23,7 @@ from mathstream import ( pow, is_even, is_odd, + collect_garbage, ) a = StreamNumber("instance/log/huge.txt") @@ -37,6 +38,9 @@ print("modulo =", "".join(mod(a, b).stream())) print("power =", "".join(pow(a, e).stream())) print("a is even?", is_even(a)) print("b is odd?", is_odd(b)) + +# reclaim space for files whose age outweighs their use +collect_garbage(0.5) ``` Each arithmetic call writes its result back into `instance/log` (configurable via `mathstream.number.LOG_DIR`) so you can stream the digits later or reuse them in further operations. @@ -49,6 +53,9 @@ Each arithmetic call writes its result back into `instance/log` (configurable vi - **Sign-aware** – Addition, subtraction, multiplication, division (`//` behavior), modulo, and exponentiation (non-negative exponents) all respect operand sign. Division/modulo follow Python’s floor-division rules. - **Utilities** – `clear_logs()` wipes prior staged results so you can start fresh. - **Parity helpers** – `is_even` and `is_odd` inspect the streamed digits without materializing the integer. +- **Garbage collection** – `collect_garbage(score_threshold)` computes a score from file age, access count, and reference count (tracked in `instance/mathstream_logs.sqlite`, freshly truncated each run). 
Files whose score meets or exceeds the threshold are deleted, letting you tune how aggressively to reclaim space. Both staged results and literal caches participate. + +Divide-by-zero scenarios raise the custom `DivideByZeroError` so callers can distinguish mathstream issues from Python’s native exceptions. ## Example Script diff --git a/mathstream/__init__.py b/mathstream/__init__.py index 3285c4a..137182b 100644 --- a/mathstream/__init__.py +++ b/mathstream/__init__.py @@ -1,11 +1,20 @@ from .engine import clear_logs, add, sub, mul, div, mod, pow, is_even, is_odd +from .exceptions import MathStreamError, DivideByZeroError from .number import StreamNumber +from .utils import collect_garbage __all__ = [ "clear_logs", - "add", "sub", - "mul", "div", "mod", + "collect_garbage", + "add", + "sub", + "mul", + "div", + "mod", "pow", - "is_even", "is_odd", + "is_even", + "is_odd", "StreamNumber", + "MathStreamError", + "DivideByZeroError", ] diff --git a/mathstream/engine.py b/mathstream/engine.py index 3691df8..210cb27 100644 --- a/mathstream/engine.py +++ b/mathstream/engine.py @@ -2,7 +2,9 @@ from __future__ import annotations from typing import Iterable, Tuple +from .exceptions import DivideByZeroError from .number import StreamNumber, LOG_DIR +from .utils import register_log_file, wipe_log_records def _ensure_log_dir() -> None: @@ -123,7 +125,7 @@ def _multiply_digit(num: str, digit: int) -> str: def _divide_abs(dividend: str, divisor: str) -> Tuple[str, str]: if divisor == "0": - raise ZeroDivisionError("division by zero") + raise DivideByZeroError("division by zero") if dividend == "0": return "0", "0" @@ -171,6 +173,7 @@ def _write_result(operation: str, operands: Iterable[StreamNumber], digits: str) out_file = LOG_DIR / f"{operation}_{operand_hash}.bin" with open(out_file, "w", encoding="utf-8") as out: out.write(digits) + register_log_file(out_file) return StreamNumber(out_file) @@ -179,6 +182,7 @@ def clear_logs(): for p in LOG_DIR.glob("*"): p.unlink() 
_ensure_log_dir() + wipe_log_records() def add(num_a: StreamNumber, num_b: StreamNumber) -> StreamNumber: @@ -266,6 +270,9 @@ def mod(num_a: StreamNumber, num_b: StreamNumber) -> StreamNumber: sign_a, a_digits = _normalize_stream(num_a) sign_b, b_digits = _normalize_stream(num_b) + if b_digits == "0": + raise DivideByZeroError("modulo by zero") + _, remainder = _divide_abs(a_digits, b_digits) if remainder == "0": diff --git a/mathstream/exceptions.py b/mathstream/exceptions.py new file mode 100644 index 0000000..d8fd35c --- /dev/null +++ b/mathstream/exceptions.py @@ -0,0 +1,6 @@ +class MathStreamError(Exception): + """Base class for mathstream-specific errors.""" + + +class DivideByZeroError(MathStreamError): + """Raised when division or modulo operations encounter a zero divisor.""" diff --git a/mathstream/number.py b/mathstream/number.py index 22d1cf4..4d7a832 100644 --- a/mathstream/number.py +++ b/mathstream/number.py @@ -2,6 +2,8 @@ import hashlib from pathlib import Path from typing import Optional, Union +from .utils import register_log_file, register_reference, touch_log_file + LOG_DIR = Path("./instance/log") @@ -29,6 +31,14 @@ def _canonicalize_literal(value: str) -> str: return f"{sign}{digits}" +def _is_in_log_dir(path: Path) -> bool: + try: + path.resolve().relative_to(LOG_DIR.resolve()) + return True + except ValueError: + return False + + class StreamNumber: def __init__( self, @@ -50,6 +60,10 @@ class StreamNumber: if not self.path.exists(): raise FileNotFoundError(self.path) + if _is_in_log_dir(self.path): + register_log_file(self.path) + register_reference(self.path) + self.hash = hashlib.sha1(str(self.path).encode()).hexdigest()[:10] def __repr__(self): @@ -57,6 +71,8 @@ class StreamNumber: def stream(self, chunk_size=4096): """Yield chunks of digits as strings.""" + if _is_in_log_dir(self.path): + touch_log_file(self.path) with open(self.path, "r", encoding="utf-8") as f: while chunk := f.read(chunk_size): yield chunk.strip().replace(",", ".") 
@@ -67,4 +83,5 @@ class StreamNumber: stage_file = LOG_DIR / f"{self.hash}_stage_{stage}.bin" with open(stage_file, "wb") as f: f.write(data.encode()) + register_log_file(stage_file) return stage_file diff --git a/mathstream/utils.py b/mathstream/utils.py index e69de29..af8fdf9 100644 --- a/mathstream/utils.py +++ b/mathstream/utils.py @@ -0,0 +1,180 @@ +import sqlite3 +from datetime import datetime, timezone +from pathlib import Path +from typing import Iterable, List + +LOG_DB_PATH = Path("./instance/mathstream_logs.sqlite") + + +def _normalize_paths(paths: Iterable[Path]) -> List[str]: + return [str(Path(p).resolve()) for p in paths] + + +def _ensure_db(reset: bool = False) -> None: + LOG_DB_PATH.parent.mkdir(parents=True, exist_ok=True) + with sqlite3.connect(LOG_DB_PATH) as conn: + conn.execute( + """ + CREATE TABLE IF NOT EXISTS logs ( + path TEXT PRIMARY KEY, + created_at REAL, + last_access REAL, + access_count INTEGER DEFAULT 0 + ) + """ + ) + conn.execute( + """ + CREATE TABLE IF NOT EXISTS refs ( + path TEXT PRIMARY KEY, + ref_count INTEGER DEFAULT 0 + ) + """ + ) + if reset: + conn.execute("DELETE FROM logs") + conn.execute("DELETE FROM refs") + conn.commit() + + +_ensure_db(reset=True) + + +def register_log_file(path: Path) -> None: + """Ensure the log database is aware of a file's existence.""" + normalized = _normalize_paths([path])[0] + _ensure_db() + timestamp = datetime.now(timezone.utc).timestamp() + with sqlite3.connect(LOG_DB_PATH) as conn: + conn.execute( + """ + INSERT INTO logs (path, created_at, last_access, access_count) + VALUES (?, ?, ?, 0) + ON CONFLICT(path) + DO NOTHING + """, + (normalized, timestamp, timestamp), + ) + conn.execute( + """ + INSERT INTO refs (path, ref_count) + VALUES (?, 0) + ON CONFLICT(path) + DO NOTHING + """, + (normalized,), + ) + conn.commit() + + +def register_reference(path: Path) -> None: + """Increment reference count similarly to Python's ref counter.""" + normalized = _normalize_paths([path])[0] + 
_ensure_db() + timestamp = datetime.now(timezone.utc).timestamp() + with sqlite3.connect(LOG_DB_PATH) as conn: + conn.execute( + """ + INSERT INTO logs (path, created_at, last_access, access_count) + VALUES (?, ?, ?, 1) + ON CONFLICT(path) + DO NOTHING + """, + (normalized, timestamp, timestamp), + ) + conn.execute( + """ + INSERT INTO refs (path, ref_count) + VALUES (?, 1) + ON CONFLICT(path) + DO UPDATE SET ref_count = ref_count + 1 + """, + (normalized,), + ) + conn.execute( + """ + UPDATE logs + SET last_access = ?, access_count = access_count + 1 + WHERE path = ? + """, + (timestamp, normalized), + ) + conn.commit() + + +def touch_log_file(path: Path) -> None: + """Refresh access metadata when a file is streamed.""" + normalized = _normalize_paths([path])[0] + _ensure_db() + timestamp = datetime.now(timezone.utc).timestamp() + with sqlite3.connect(LOG_DB_PATH) as conn: + conn.execute( + """ + INSERT INTO logs (path, created_at, last_access, access_count) + VALUES (?, ?, ?, 1) + ON CONFLICT(path) + DO UPDATE SET + last_access = excluded.last_access, + access_count = logs.access_count + 1 + """, + (normalized, timestamp, timestamp), + ) + conn.commit() + + +def wipe_log_records() -> None: + """Drop all bookkeeping (used after manual log purges).""" + _ensure_db() + with sqlite3.connect(LOG_DB_PATH) as conn: + conn.execute("DELETE FROM logs") + conn.execute("DELETE FROM refs") + conn.commit() + + +def _delete_records(paths: List[Path]) -> None: + if not paths: + return + normalized = [(str(p.resolve()),) for p in paths] + with sqlite3.connect(LOG_DB_PATH) as conn: + conn.executemany("DELETE FROM logs WHERE path = ?", normalized) + conn.executemany("DELETE FROM refs WHERE path = ?", normalized) + conn.commit() + + +def collect_garbage(score_threshold: float) -> list[Path]: + """Remove seldom-used staged files based on an age/refcount score.""" + if score_threshold < 0: + raise ValueError("score_threshold must be non-negative") + _ensure_db() + now = 
datetime.now(timezone.utc).timestamp() + with sqlite3.connect(LOG_DB_PATH) as conn: + rows = conn.execute( + """ + SELECT + l.path, + COALESCE(l.created_at, ?), + COALESCE(l.last_access, l.created_at, ?), + COALESCE(l.access_count, 0), + COALESCE(r.ref_count, 0) + FROM logs l + LEFT JOIN refs r ON l.path = r.path + """, + (now, now), + ).fetchall() + + removed: list[Path] = [] + for path_str, created_at, last_access, access_count, ref_count in rows: + path = Path(path_str) + age = now - (last_access or created_at or now) + score = age / ((ref_count + 1) * (access_count + 1)) + if score < score_threshold: + continue + if path.exists(): + try: + path.unlink() + except OSError: + continue + removed.append(path) + + _delete_records(removed) + return removed diff --git a/test.py b/test.py index 26c13b7..dc520db 100644 --- a/test.py +++ b/test.py @@ -13,6 +13,8 @@ from mathstream import ( is_even, is_odd, clear_logs, + collect_garbage, + DivideByZeroError, ) NUMBERS_DIR = Path(__file__).parent / "tests" @@ -54,6 +56,7 @@ def main() -> None: negative_divisor = write_number("neg_divisor", "-34567") literal_even = StreamNumber(literal="2000") literal_odd = StreamNumber(literal="-3") + zero_literal = StreamNumber(literal="0") # Showcase the core operations. 
total = add(big, small) @@ -85,6 +88,27 @@ def main() -> None: check_bool("is_even(literal_even)", is_even(literal_even), True) check_bool("is_odd(literal_odd)", is_odd(literal_odd), True) + # Custom exception coverage + try: + div(literal_even, zero_literal) + except DivideByZeroError: + print("div(literal_even, zero_literal) raised DivideByZeroError as expected") + else: + raise AssertionError("div by zero did not raise DivideByZeroError") + + try: + mod(literal_even, zero_literal) + except DivideByZeroError: + print("mod(literal_even, zero_literal) raised DivideByZeroError as expected") + else: + raise AssertionError("mod by zero did not raise DivideByZeroError") + + removed = collect_garbage(0) + print(f"collect_garbage removed {len(removed)} files") + check_bool("total exists post GC", total.path.exists(), False) + check_bool("literal_even exists post GC", literal_even.path.exists(), False) + check_bool("huge operand persists", big.path.exists(), True) + if __name__ == "__main__": main()