diff --git a/WORK.md b/WORK.md new file mode 100644 index 0000000..a9a02e6 --- /dev/null +++ b/WORK.md @@ -0,0 +1,78 @@ +# What’s going on inside `mathstream` + +Figured I’d write down how the whole thing is wired, more like a brain dump than polished docs. If you’re diving in to fix something, this should save you a bunch of spelunking. + +## Directory map (so you know where to poke) + +``` +mathstream/ + __init__.py # re-export central + engine.py # arithmetic guts + exceptions.py # custom errors + number.py # StreamNumber, manual GC, watcher + utils.py # sqlite junk drawer + +test.py # smoke/integration script +``` + +## StreamNumber – the heart of it + +`mathstream/number.py` owns the `StreamNumber` class. The class does a couple jobs: + +- wraps a file of digits (either you give us a path or a literal; literals get canonicalised and dropped into `instance/log/literal_.txt`). +- streaming happens via `.stream(chunk_size)` so we never load the whole thing; every time we read we call `touch_log_file` so the usage timestamp keeps moving. +- when a new stream gets created we check if it lives under `LOG_DIR`. If yes, we register it with the sqlite tracker (`register_log_file`) and also bump a ref counter via `register_reference`. +- there’s a weakref finaliser plus a global `_ACTIVE_COUNTER` that keeps tabs on python-side references. If the object falls out of scope we run `_finalize_instance`, which decrements the counter and if it was the last one we call `release_reference` (that may nuke the file instantly). +- explicit `free()` exists for people who want deterministic cleanup. It’s basically like `free()` in C: drop ref count and optionally delete the file now. There’s an alias `free_stream` and the class is a context manager so `with StreamNumber(...) as sn:` cleans up automatically. + +So any time you’ve got a staged result hanging around in memory, the watcher knows about it. Once you ditch it—either by `free()` or just letting the object die—the sqlite ref count drops. + +## Engine – maths without ints + +Living in `mathstream/engine.py`. All the operators (`add/sub/mul/div/mod/pow`) pull chunks from the `StreamNumber` inputs, normalise them into sign + digit strings, run grade-school algorithms, then write the result back into `LOG_DIR`. + +- `_write_result` is the important bit: writes to disk, calls `register_log_file`, then wraps the file in a new `StreamNumber`. Because of that call, every staged result is tracked automatically. +- We’re careful about signs: division and modulo follow Python’s floor division rules. Divide-by-zero is intercepted and converted into `DivideByZeroError`. +- `clear_logs()` wipes the folder and calls `wipe_log_records()` to empty sqlite so the next run isn’t polluted. + +## Exceptions + +`mathstream/exceptions.py` just defines `MathStreamError` and the more specific `DivideByZeroError`. Nothing fancy, just so we don’t leak raw `ZeroDivisionError`. + +## SQLite watcher (`mathstream/utils.py`) + +This is the garbage-collection HQ. On import we run `_ensure_db(reset=True)` so every run starts from a clean DB (no migrations, no surprises). Two tables: + +- `logs` → metadata about every staged file: created time, last access, access count. +- `refs` → current reference count (think “how many StreamNumber instances think they own this file”). + +Important functions: + +- `register_log_file(path)` – ensure both tables have a row (initial ref count 0). +- `register_reference(path)` – increments the ref count, updates last access, access count etc. Called whenever a new `StreamNumber` points at the staged file. +- `touch_log_file(path)` – called from `.stream()` so we know the file is being read. +- `release_reference(path, delete_file=True)` – the inverse of register. If the count hits zero we remove the DB row and (optionally) delete the file right away. +- `collect_garbage(score_threshold)` – this is the periodic sweeper. Computes `score = age / ((ref_count + 1) * (access_count + 1))`. Bigger score means older + less used. If score >= threshold it gets unlinked and removed from DB. Negative thresholds blow up on purpose. +- `tracked_files()` – dumb helper that dumps `{path: ref_count}` out of the DB. +- `wipe_log_records()` – nukes both tables; used by `clear_logs`. + +## How cleanup flows + +1. You run an operation (`add`, `mul`, whatever). Result file lands in `LOG_DIR`, gets registered, comes back as a `StreamNumber`. +2. You stream it or create more streams from it – metadata keeps getting updated via `touch_log_file`/`register_reference`. +3. When you’re done, call `.free()` or just drop references. Manual free is immediate. Otherwise the weakref finaliser catches it eventually. +4. `release_reference` is what actually removes the sqlite entries and unlinks the data file when there are no logical references left. +5. If you still have detritus (e.g. you crashed before refs hit zero), run `collect_garbage(threshold)` to sweep anything whose age outweighs usage. +6. `active_streams()` reports what’s still alive in Python land; `tracked_files()` shows what the DB thinks is referenced. + +## Example run (`test.py`) + +`test.py` is half regression, half reference script. It: + +- seeds some numbers, runs every operation, checks results. +- makes sure `DivideByZeroError` fires. +- frees every staged number to prove files vanish on the spot. +- runs `collect_garbage(0)` just to make sure nothing else lingers. +- dumps `active_streams()` and `tracked_files()` so you can see python vs sqlite state. + +If the logs ever seem suspicious, run that script—it’ll tell you immediately whether something’s still referenced or if the GC is forgetting to clean up. diff --git a/mathstream/README.md b/mathstream/README.md index f02ad9f..864044c 100644 --- a/mathstream/README.md +++ b/mathstream/README.md @@ -23,6 +23,7 @@ from mathstream import ( pow, is_even, is_odd, + free_stream, collect_garbage, ) @@ -39,6 +40,9 @@ print("power =", "".join(pow(a, e).stream())) print("a is even?", is_even(a)) print("b is odd?", is_odd(b)) +# drop staged artifacts immediately when you are done +free_stream(b) + # reclaim space for files whose age outweighs their use collect_garbage(0.5) ``` @@ -52,8 +56,9 @@ Each arithmetic call writes its result back into `instance/log` (configurable vi - **Automatic staging** – Outputs are stored under `LOG_DIR` with hashes based on input file paths, letting you compose operations without manual bookkeeping. - **Sign-aware** – Addition, subtraction, multiplication, division (`//` behavior), modulo, and exponentiation (non-negative exponents) all respect operand sign. Division/modulo follow Python’s floor-division rules. - **Utilities** – `clear_logs()` wipes prior staged results so you can start fresh. +- **Manual freeing** – Call `stream.free()` (or `free_stream(stream)`) once you are done with a staged number to release its reference immediately. Logger metadata keeps per-path reference counts so the final free removes the backing file on the spot. - **Parity helpers** – `is_even` and `is_odd` inspect the streamed digits without materializing the integer. -- **Garbage collection** – `collect_garbage(score_threshold)` computes a score from file age, access count, and reference count (tracked in `instance/mathstream_logs.sqlite`, freshly truncated each run). Files whose score meets or exceeds the threshold are deleted, letting you tune how aggressively to reclaim space. Both staged results and literal caches participate. +- **Garbage collection** – `collect_garbage(score_threshold)` computes a score from file age, access count, and reference count (tracked in `instance/mathstream_logs.sqlite`, freshly truncated each run). Files whose score meets or exceeds the threshold are deleted, letting you tune how aggressively to reclaim space. Both staged results and literal caches participate. Use `tracked_files()` or `active_streams()` to inspect current state. Divide-by-zero scenarios raise the custom `DivideByZeroError` so callers can distinguish mathstream issues from Python’s native exceptions. diff --git a/mathstream/__init__.py b/mathstream/__init__.py index 137182b..8814edf 100644 --- a/mathstream/__init__.py +++ b/mathstream/__init__.py @@ -1,11 +1,12 @@ from .engine import clear_logs, add, sub, mul, div, mod, pow, is_even, is_odd from .exceptions import MathStreamError, DivideByZeroError -from .number import StreamNumber -from .utils import collect_garbage +from .number import StreamNumber, free_stream, active_streams +from .utils import collect_garbage, tracked_files __all__ = [ "clear_logs", "collect_garbage", + "tracked_files", "add", "sub", "mul", @@ -15,6 +16,8 @@ __all__ = [ "is_even", "is_odd", "StreamNumber", + "free_stream", + "active_streams", "MathStreamError", "DivideByZeroError", ] diff --git a/mathstream/number.py b/mathstream/number.py index 4d7a832..1cad0bd 100644 --- a/mathstream/number.py +++ b/mathstream/number.py @@ -1,8 +1,15 @@ import hashlib +import weakref +from collections import Counter from pathlib import Path -from typing import Optional, Union +from typing import Dict, Optional, Union -from .utils import register_log_file, register_reference, touch_log_file +from .utils import ( + register_log_file, + register_reference, + touch_log_file, + release_reference, +) LOG_DIR = Path("./instance/log") @@ -60,11 +67,19 @@ class StreamNumber: if not self.path.exists(): raise FileNotFoundError(self.path) + self.hash = hashlib.sha1(str(self.path).encode()).hexdigest()[:10] + self._normalized_path = str(self.path.resolve()) + self._released = False + + _increment_active(self.path) + if _is_in_log_dir(self.path): register_log_file(self.path) register_reference(self.path) - self.hash = hashlib.sha1(str(self.path).encode()).hexdigest()[:10] + self._finalizer = weakref.finalize( + self, _finalize_instance, self._normalized_path + ) def __repr__(self): return f"" @@ -85,3 +100,52 @@ class StreamNumber: f.write(data.encode()) register_log_file(stage_file) return stage_file + + def free(self, *, delete_file: bool = True) -> None: + """Release this stream's reference and optionally delete the staged file.""" + if self._released: + return + self._released = True + if self._finalizer.alive: + self._finalizer.detach() + _decrement_active(Path(self._normalized_path), delete_file=delete_file) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + self.free() + + +_ACTIVE_COUNTER: Counter[str] = Counter() + + +def _increment_active(path: Path) -> None: + key = str(path.resolve()) + _ACTIVE_COUNTER[key] += 1 + + +def _decrement_active(path: Path, delete_file: bool = True) -> None: + key = str(path.resolve()) + current = _ACTIVE_COUNTER.get(key, 0) + if current <= 1: + _ACTIVE_COUNTER.pop(key, None) + else: + _ACTIVE_COUNTER[key] = current - 1 + + if _is_in_log_dir(path): + release_reference(path, delete_file=delete_file) + + +def _finalize_instance(path_str: str) -> None: + _decrement_active(Path(path_str)) + + +def free_stream(number: StreamNumber, *, delete_file: bool = True) -> None: + """Convenience helper mirroring manual memory management semantics.""" + number.free(delete_file=delete_file) + + +def active_streams() -> Dict[str, int]: + """Return the active StreamNumber paths mapped to in-memory reference counts.""" + return dict(_ACTIVE_COUNTER) diff --git a/mathstream/utils.py b/mathstream/utils.py index af8fdf9..b0d1120 100644 --- a/mathstream/utils.py +++ b/mathstream/utils.py @@ -1,7 +1,7 @@ import sqlite3 from datetime import datetime, timezone from pathlib import Path -from typing import Iterable, List +from typing import Iterable, List, Dict LOG_DB_PATH = Path("./instance/mathstream_logs.sqlite") @@ -178,3 +178,43 @@ def collect_garbage(score_threshold: float) -> list[Path]: _delete_records(removed) return removed + + +def release_reference(path: Path, delete_file: bool = True) -> bool: + """Decrease the reference count and optionally delete the file when it hits zero.""" + normalized = _normalize_paths([path])[0] + _ensure_db() + with sqlite3.connect(LOG_DB_PATH) as conn: + row = conn.execute( + "SELECT ref_count FROM refs WHERE path = ?", (normalized,) + ).fetchone() + if row is None: + return False + current = row[0] or 0 + new_count = max(current - 1, 0) + if new_count > 0: + conn.execute( + "UPDATE refs SET ref_count = ? WHERE path = ?", (new_count, normalized) + ) + conn.commit() + return False + conn.execute("DELETE FROM refs WHERE path = ?", (normalized,)) + conn.execute("DELETE FROM logs WHERE path = ?", (normalized,)) + conn.commit() + + removed = False + if delete_file and path.exists(): + try: + path.unlink() + removed = True + except OSError: + removed = False + return removed + + +def tracked_files() -> Dict[str, int]: + """Return a mapping of tracked file paths to their reference counts.""" + _ensure_db() + with sqlite3.connect(LOG_DB_PATH) as conn: + rows = conn.execute("SELECT path, ref_count FROM refs").fetchall() + return {path: ref_count for path, ref_count in rows}