added better garbage collection

This commit is contained in:
Dominik Krenn 2025-11-05 08:35:01 +01:00
parent df9b2b5f29
commit 443f9f4f4b
7 changed files with 254 additions and 4 deletions

View File

@ -23,6 +23,7 @@ from mathstream import (
pow,
is_even,
is_odd,
collect_garbage,
)
a = StreamNumber("instance/log/huge.txt")
@ -37,6 +38,9 @@ print("modulo =", "".join(mod(a, b).stream()))
print("power =", "".join(pow(a, e).stream()))
print("a is even?", is_even(a))
print("b is odd?", is_odd(b))
# reclaim space for files whose age outweighs their use
collect_garbage(0.5)
```
Each arithmetic call writes its result back into `instance/log` (configurable via `mathstream.number.LOG_DIR`) so you can stream the digits later or reuse them in further operations.
@ -49,6 +53,9 @@ Each arithmetic call writes its result back into `instance/log` (configurable vi
- **Sign-aware** Addition, subtraction, multiplication, division (`//` behavior), modulo, and exponentiation (non-negative exponents) all respect operand sign. Division/modulo follow Python's floor-division rules.
- **Utilities** `clear_logs()` wipes prior staged results so you can start fresh.
- **Parity helpers** `is_even` and `is_odd` inspect the streamed digits without materializing the integer.
- **Garbage collection** `collect_garbage(score_threshold)` computes a score from file age, access count, and reference count (tracked in `instance/mathstream_logs.sqlite`, freshly truncated each run). Files whose score meets or exceeds the threshold are deleted, letting you tune how aggressively to reclaim space. Both staged results and literal caches participate.
Divide-by-zero scenarios raise the custom `DivideByZeroError` so callers can distinguish mathstream issues from Python's native exceptions.
## Example Script

View File

@ -1,11 +1,20 @@
from .engine import clear_logs, add, sub, mul, div, mod, pow, is_even, is_odd
from .exceptions import MathStreamError, DivideByZeroError
from .number import StreamNumber
from .utils import collect_garbage
# Names re-exported as the package's public API. Each name appears exactly
# once; the previous list repeated several entries.
__all__ = [
    "clear_logs",
    "collect_garbage",
    "add",
    "sub",
    "mul",
    "div",
    "mod",
    "pow",
    "is_even",
    "is_odd",
    "StreamNumber",
    "MathStreamError",
    "DivideByZeroError",
]

View File

@ -2,7 +2,9 @@ from __future__ import annotations
from typing import Iterable, Tuple
from .exceptions import DivideByZeroError
from .number import StreamNumber, LOG_DIR
from .utils import register_log_file, wipe_log_records
def _ensure_log_dir() -> None:
@ -123,7 +125,7 @@ def _multiply_digit(num: str, digit: int) -> str:
def _divide_abs(dividend: str, divisor: str) -> Tuple[str, str]:
if divisor == "0":
raise ZeroDivisionError("division by zero")
raise DivideByZeroError("division by zero")
if dividend == "0":
return "0", "0"
@ -171,6 +173,7 @@ def _write_result(operation: str, operands: Iterable[StreamNumber], digits: str)
out_file = LOG_DIR / f"{operation}_{operand_hash}.bin"
with open(out_file, "w", encoding="utf-8") as out:
out.write(digits)
register_log_file(out_file)
return StreamNumber(out_file)
@ -179,6 +182,7 @@ def clear_logs():
for p in LOG_DIR.glob("*"):
p.unlink()
_ensure_log_dir()
wipe_log_records()
def add(num_a: StreamNumber, num_b: StreamNumber) -> StreamNumber:
@ -266,6 +270,9 @@ def mod(num_a: StreamNumber, num_b: StreamNumber) -> StreamNumber:
sign_a, a_digits = _normalize_stream(num_a)
sign_b, b_digits = _normalize_stream(num_b)
if b_digits == "0":
raise DivideByZeroError("modulo by zero")
_, remainder = _divide_abs(a_digits, b_digits)
if remainder == "0":

6
mathstream/exceptions.py Normal file
View File

@ -0,0 +1,6 @@
class MathStreamError(Exception):
    """Common ancestor for every mathstream-specific exception."""
class DivideByZeroError(MathStreamError):
    """Signals that a division or modulo operation was given a zero divisor."""

View File

@ -2,6 +2,8 @@ import hashlib
from pathlib import Path
from typing import Optional, Union
from .utils import register_log_file, register_reference, touch_log_file
LOG_DIR = Path("./instance/log")
@ -29,6 +31,14 @@ def _canonicalize_literal(value: str) -> str:
return f"{sign}{digits}"
def _is_in_log_dir(path: Path) -> bool:
    """Report whether *path* resolves to ``LOG_DIR`` or a location beneath it."""
    candidate = path.resolve()
    log_root = LOG_DIR.resolve()
    try:
        # relative_to() raises ValueError when candidate is not under log_root.
        candidate.relative_to(log_root)
    except ValueError:
        return False
    return True
class StreamNumber:
def __init__(
self,
@ -50,6 +60,10 @@ class StreamNumber:
if not self.path.exists():
raise FileNotFoundError(self.path)
if _is_in_log_dir(self.path):
register_log_file(self.path)
register_reference(self.path)
self.hash = hashlib.sha1(str(self.path).encode()).hexdigest()[:10]
def __repr__(self):
@ -57,6 +71,8 @@ class StreamNumber:
def stream(self, chunk_size=4096):
"""Yield chunks of digits as strings."""
if _is_in_log_dir(self.path):
touch_log_file(self.path)
with open(self.path, "r", encoding="utf-8") as f:
while chunk := f.read(chunk_size):
yield chunk.strip().replace(",", ".")
@ -67,4 +83,5 @@ class StreamNumber:
stage_file = LOG_DIR / f"{self.hash}_stage_{stage}.bin"
with open(stage_file, "wb") as f:
f.write(data.encode())
register_log_file(stage_file)
return stage_file

View File

@ -0,0 +1,180 @@
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable, List
# SQLite database file that stores the bookkeeping tables (`logs` and `refs`).
# The path is relative, so it is resolved against the current working directory.
LOG_DB_PATH = Path("./instance/mathstream_logs.sqlite")
def _normalize_paths(paths: Iterable[Path]) -> List[str]:
    """Return every entry of *paths* as a resolved, absolute path string."""
    resolved: List[str] = []
    for entry in paths:
        absolute = Path(entry).resolve()
        resolved.append(str(absolute))
    return resolved
def _ensure_db(reset: bool = False) -> None:
    """Create the bookkeeping database and its tables when they are missing.

    The parent directory of ``LOG_DB_PATH`` is created if necessary, then the
    ``logs`` and ``refs`` tables are created unless they already exist.

    Args:
        reset: When true, additionally delete every row from both tables.
    """
    LOG_DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(LOG_DB_PATH)
    try:
        # ``with conn`` only scopes the transaction (commit on success,
        # rollback on error); it does not close the connection, so the
        # ``finally`` below releases it explicitly.
        with conn:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS logs (
                    path TEXT PRIMARY KEY,
                    created_at REAL,
                    last_access REAL,
                    access_count INTEGER DEFAULT 0
                )
                """
            )
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS refs (
                    path TEXT PRIMARY KEY,
                    ref_count INTEGER DEFAULT 0
                )
                """
            )
            if reset:
                conn.execute("DELETE FROM logs")
                conn.execute("DELETE FROM refs")
    finally:
        conn.close()
# Executed at import time: makes sure the database and tables exist and empties
# both tables, so bookkeeping from before this import is discarded.
_ensure_db(reset=True)
def register_log_file(path: Path) -> None:
    """Ensure the log database is aware of a file's existence.

    Inserts a row for *path* into ``logs`` (creation and last-access time set
    to the current UTC timestamp, zero accesses) and into ``refs`` (zero
    references). Rows that already exist are left untouched.

    Args:
        path: File to track; stored as a resolved, absolute path string.
    """
    normalized = _normalize_paths([path])[0]
    _ensure_db()
    timestamp = datetime.now(timezone.utc).timestamp()
    conn = sqlite3.connect(LOG_DB_PATH)
    try:
        # The connection context manager commits or rolls back but never
        # closes the connection, hence the explicit close() in ``finally``.
        with conn:
            conn.execute(
                """
                INSERT INTO logs (path, created_at, last_access, access_count)
                VALUES (?, ?, ?, 0)
                ON CONFLICT(path)
                DO NOTHING
                """,
                (normalized, timestamp, timestamp),
            )
            conn.execute(
                """
                INSERT INTO refs (path, ref_count)
                VALUES (?, 0)
                ON CONFLICT(path)
                DO NOTHING
                """,
                (normalized,),
            )
    finally:
        conn.close()
def register_reference(path: Path) -> None:
    """Record one new reference to *path* and count it as one access.

    The ``refs`` row for the path has its ``ref_count`` incremented (or is
    created with a count of 1). The ``logs`` row has ``last_access`` refreshed
    and ``access_count`` incremented by exactly one; a path that was not yet
    tracked is created with ``access_count = 1``.

    Args:
        path: File being referenced; stored as a resolved, absolute path string.
    """
    normalized = _normalize_paths([path])[0]
    _ensure_db()
    timestamp = datetime.now(timezone.utc).timestamp()
    conn = sqlite3.connect(LOG_DB_PATH)
    try:
        # The connection context manager commits or rolls back but never
        # closes the connection, hence the explicit close() in ``finally``.
        with conn:
            # Single upsert instead of INSERT(access_count=1) followed by an
            # unconditional UPDATE(+1): the old sequence counted the first
            # reference of an untracked path as two accesses.
            conn.execute(
                """
                INSERT INTO logs (path, created_at, last_access, access_count)
                VALUES (?, ?, ?, 1)
                ON CONFLICT(path)
                DO UPDATE SET
                    last_access = excluded.last_access,
                    access_count = logs.access_count + 1
                """,
                (normalized, timestamp, timestamp),
            )
            conn.execute(
                """
                INSERT INTO refs (path, ref_count)
                VALUES (?, 1)
                ON CONFLICT(path)
                DO UPDATE SET ref_count = ref_count + 1
                """,
                (normalized,),
            )
    finally:
        conn.close()
def touch_log_file(path: Path) -> None:
    """Refresh access metadata when a file is streamed.

    Sets ``last_access`` to the current UTC timestamp and increments
    ``access_count`` for the path's ``logs`` row; a path that is not tracked
    yet is inserted with ``access_count = 1``.

    Args:
        path: File being accessed; stored as a resolved, absolute path string.
    """
    normalized = _normalize_paths([path])[0]
    _ensure_db()
    timestamp = datetime.now(timezone.utc).timestamp()
    conn = sqlite3.connect(LOG_DB_PATH)
    try:
        # The connection context manager commits or rolls back but never
        # closes the connection, hence the explicit close() in ``finally``.
        with conn:
            conn.execute(
                """
                INSERT INTO logs (path, created_at, last_access, access_count)
                VALUES (?, ?, ?, 1)
                ON CONFLICT(path)
                DO UPDATE SET
                    last_access = excluded.last_access,
                    access_count = logs.access_count + 1
                """,
                (normalized, timestamp, timestamp),
            )
    finally:
        conn.close()
def wipe_log_records() -> None:
    """Drop all bookkeeping (used after manual log purges).

    Delegates to ``_ensure_db(reset=True)``, which creates the tables when
    they are missing and deletes every row from ``logs`` and ``refs`` on a
    single connection, instead of opening a second connection to repeat the
    same DELETE statements.
    """
    _ensure_db(reset=True)
def _delete_records(paths: List[Path]) -> None:
    """Remove the ``logs`` and ``refs`` rows belonging to *paths*.

    Args:
        paths: Files whose bookkeeping should be dropped. An empty list is a
            no-op and does not touch the database.
    """
    if not paths:
        return
    # Reuse the shared normalizer so the keys match how rows were stored.
    params = [(normalized,) for normalized in _normalize_paths(paths)]
    conn = sqlite3.connect(LOG_DB_PATH)
    try:
        # The connection context manager commits or rolls back but never
        # closes the connection, hence the explicit close() in ``finally``.
        with conn:
            conn.executemany("DELETE FROM logs WHERE path = ?", params)
            conn.executemany("DELETE FROM refs WHERE path = ?", params)
    finally:
        conn.close()
def collect_garbage(score_threshold: float) -> List[Path]:
    """Delete tracked files whose staleness score reaches *score_threshold*.

    Every row of the ``logs`` table is scored as::

        seconds_since_last_access / ((ref_count + 1) * (access_count + 1))

    so files that are older and have fewer references and accesses score
    higher. Files whose score is greater than or equal to the threshold are
    unlinked and their bookkeeping rows are removed. Rows whose file is
    already missing are removed as well.

    Args:
        score_threshold: Minimum score at which a file is collected. With
            ``0`` every file with a non-negative score qualifies.

    Returns:
        The paths that were collected, including paths whose file was already
        gone and whose stale record was dropped.

    Raises:
        ValueError: If *score_threshold* is negative or NaN.
    """
    # Written as ``not (>= 0)`` so NaN is rejected too: every comparison with
    # NaN is false, which would otherwise make all files pass the score check
    # below and be deleted.
    if not (score_threshold >= 0):
        raise ValueError("score_threshold must be non-negative")

    _ensure_db()
    now = datetime.now(timezone.utc).timestamp()

    conn = sqlite3.connect(LOG_DB_PATH)
    try:
        rows = conn.execute(
            """
            SELECT
                l.path,
                COALESCE(l.created_at, ?),
                COALESCE(l.last_access, l.created_at, ?),
                COALESCE(l.access_count, 0),
                COALESCE(r.ref_count, 0)
            FROM logs l
            LEFT JOIN refs r ON l.path = r.path
            """,
            (now, now),
        ).fetchall()
    finally:
        # sqlite3 connections are not closed by their context manager, so
        # release this read-only connection explicitly.
        conn.close()

    removed: List[Path] = []
    for path_str, created_at, last_access, access_count, ref_count in rows:
        age = now - (last_access or created_at or now)
        score = age / ((ref_count + 1) * (access_count + 1))
        if score < score_threshold:
            continue

        path = Path(path_str)
        try:
            # missing_ok avoids the exists()/unlink() race; a file that is
            # already gone still has its stale record dropped.
            path.unlink(missing_ok=True)
        except OSError:
            # Deletion failed: keep the record so a later run can retry.
            continue
        removed.append(path)

    _delete_records(removed)
    return removed

24
test.py
View File

@ -13,6 +13,8 @@ from mathstream import (
is_even,
is_odd,
clear_logs,
collect_garbage,
DivideByZeroError,
)
NUMBERS_DIR = Path(__file__).parent / "tests"
@ -54,6 +56,7 @@ def main() -> None:
negative_divisor = write_number("neg_divisor", "-34567")
literal_even = StreamNumber(literal="2000")
literal_odd = StreamNumber(literal="-3")
zero_literal = StreamNumber(literal="0")
# Showcase the core operations.
total = add(big, small)
@ -85,6 +88,27 @@ def main() -> None:
check_bool("is_even(literal_even)", is_even(literal_even), True)
check_bool("is_odd(literal_odd)", is_odd(literal_odd), True)
# Custom exception coverage
try:
div(literal_even, zero_literal)
except DivideByZeroError:
print("div(literal_even, zero_literal) raised DivideByZeroError as expected")
else:
raise AssertionError("div by zero did not raise DivideByZeroError")
try:
mod(literal_even, zero_literal)
except DivideByZeroError:
print("mod(literal_even, zero_literal) raised DivideByZeroError as expected")
else:
raise AssertionError("mod by zero did not raise DivideByZeroError")
removed = collect_garbage(0)
print(f"collect_garbage removed {len(removed)} files")
check_bool("total exists post GC", total.path.exists(), False)
check_bool("literal_even exists post GC", literal_even.path.exists(), False)
check_bool("huge operand persists", big.path.exists(), True)
if __name__ == "__main__":
main()