refactor gc into store

this will make refactoring this easier later and limits the api surface of Store
This commit is contained in:
Anthony Sottile 2025-11-09 16:57:40 -05:00
parent 17cf886473
commit d5c273a2ba
4 changed files with 101 additions and 111 deletions

View file

@ -1,89 +1,9 @@
from __future__ import annotations
import os.path
from typing import Any
import pre_commit.constants as C
from pre_commit import output
from pre_commit.clientlib import InvalidConfigError
from pre_commit.clientlib import InvalidManifestError
from pre_commit.clientlib import load_config
from pre_commit.clientlib import load_manifest
from pre_commit.clientlib import LOCAL
from pre_commit.clientlib import META
from pre_commit.store import Store
def _mark_used_repos(
store: Store,
all_repos: dict[tuple[str, str], str],
unused_repos: set[tuple[str, str]],
repo: dict[str, Any],
) -> None:
if repo['repo'] == META:
return
elif repo['repo'] == LOCAL:
for hook in repo['hooks']:
deps = hook.get('additional_dependencies')
unused_repos.discard((
store.db_repo_name(repo['repo'], deps), C.LOCAL_REPO_VERSION,
))
else:
key = (repo['repo'], repo['rev'])
path = all_repos.get(key)
# can't inspect manifest if it isn't cloned
if path is None:
return
try:
manifest = load_manifest(os.path.join(path, C.MANIFEST_FILE))
except InvalidManifestError:
return
else:
unused_repos.discard(key)
by_id = {hook['id']: hook for hook in manifest}
for hook in repo['hooks']:
if hook['id'] not in by_id:
continue
deps = hook.get(
'additional_dependencies',
by_id[hook['id']]['additional_dependencies'],
)
unused_repos.discard((
store.db_repo_name(repo['repo'], deps), repo['rev'],
))
def _gc_repos(store: Store) -> int:
configs = store.select_all_configs()
repos = store.select_all_repos()
# delete config paths which do not exist
dead_configs = [p for p in configs if not os.path.exists(p)]
live_configs = [p for p in configs if os.path.exists(p)]
all_repos = {(repo, ref): path for repo, ref, path in repos}
unused_repos = set(all_repos)
for config_path in live_configs:
try:
config = load_config(config_path)
except InvalidConfigError:
dead_configs.append(config_path)
continue
else:
for repo in config['repos']:
_mark_used_repos(store, all_repos, unused_repos, repo)
store.delete_configs(dead_configs)
for db_repo_name, ref in unused_repos:
store.delete_repo(db_repo_name, ref, all_repos[(db_repo_name, ref)])
return len(unused_repos)
def gc(store: Store) -> int:
with store.exclusive_lock():
repos_removed = _gc_repos(store)
output.write_line(f'{repos_removed} repo(s) removed.')
output.write_line(f'{store.gc()} repo(s) removed.')
return 0

View file

@ -8,6 +8,7 @@ import tempfile
from collections.abc import Callable
from collections.abc import Generator
from collections.abc import Sequence
from typing import Any
import pre_commit.constants as C
from pre_commit import clientlib
@ -96,7 +97,7 @@ class Store:
' PRIMARY KEY (repo, ref)'
');',
)
self._create_config_table(db)
self._create_configs_table(db)
# Atomic file move
os.replace(tmpfile, self.db_path)
@ -215,7 +216,7 @@ class Store:
'local', C.LOCAL_REPO_VERSION, deps, _make_local_repo,
)
def _create_config_table(self, db: sqlite3.Connection) -> None:
def _create_configs_table(self, db: sqlite3.Connection) -> None:
db.executescript(
'CREATE TABLE IF NOT EXISTS configs ('
' path TEXT NOT NULL,'
@ -232,28 +233,83 @@ class Store:
return
with self.connect() as db:
# TODO: eventually remove this and only create in _create
self._create_config_table(db)
self._create_configs_table(db)
db.execute('INSERT OR IGNORE INTO configs VALUES (?)', (path,))
def select_all_configs(self) -> list[str]:
with self.connect() as db:
self._create_config_table(db)
rows = db.execute('SELECT path FROM configs').fetchall()
return [path for path, in rows]
def _mark_used_repos(
self,
all_repos: dict[tuple[str, str], str],
unused_repos: set[tuple[str, str]],
repo: dict[str, Any],
) -> None:
if repo['repo'] == clientlib.META:
return
elif repo['repo'] == clientlib.LOCAL:
for hook in repo['hooks']:
deps = hook.get('additional_dependencies')
unused_repos.discard((
self.db_repo_name(repo['repo'], deps),
C.LOCAL_REPO_VERSION,
))
else:
key = (repo['repo'], repo['rev'])
path = all_repos.get(key)
# can't inspect manifest if it isn't cloned
if path is None:
return
def delete_configs(self, configs: list[str]) -> None:
with self.connect() as db:
rows = [(path,) for path in configs]
db.executemany('DELETE FROM configs WHERE path = ?', rows)
try:
manifest = clientlib.load_manifest(
os.path.join(path, C.MANIFEST_FILE),
)
except clientlib.InvalidManifestError:
return
else:
unused_repos.discard(key)
by_id = {hook['id']: hook for hook in manifest}
def select_all_repos(self) -> list[tuple[str, str, str]]:
with self.connect() as db:
return db.execute('SELECT repo, ref, path from repos').fetchall()
for hook in repo['hooks']:
if hook['id'] not in by_id:
continue
def delete_repo(self, db_repo_name: str, ref: str, path: str) -> None:
with self.connect() as db:
db.execute(
deps = hook.get(
'additional_dependencies',
by_id[hook['id']]['additional_dependencies'],
)
unused_repos.discard((
self.db_repo_name(repo['repo'], deps), repo['rev'],
))
def gc(self) -> int:
with self.exclusive_lock(), self.connect() as db:
self._create_configs_table(db)
repos = db.execute('SELECT repo, ref, path FROM repos').fetchall()
all_repos = {(repo, ref): path for repo, ref, path in repos}
unused_repos = set(all_repos)
configs_rows = db.execute('SELECT path FROM configs').fetchall()
configs = [path for path, in configs_rows]
dead_configs = []
for config_path in configs:
try:
config = clientlib.load_config(config_path)
except clientlib.InvalidConfigError:
dead_configs.append(config_path)
continue
else:
for repo in config['repos']:
self._mark_used_repos(all_repos, unused_repos, repo)
paths = [(path,) for path in dead_configs]
db.executemany('DELETE FROM configs WHERE path = ?', paths)
db.executemany(
'DELETE FROM repos WHERE repo = ? and ref = ?',
(db_repo_name, ref),
sorted(unused_repos),
)
rmtree(path)
for k in unused_repos:
rmtree(all_repos[k])
return len(unused_repos)

View file

@ -19,11 +19,13 @@ from testing.util import git_commit
def _repo_count(store):
return len(store.select_all_repos())
with store.connect() as db:
return db.execute('SELECT COUNT(1) FROM repos').fetchone()[0]
def _config_count(store):
return len(store.select_all_configs())
with store.connect() as db:
return db.execute('SELECT COUNT(1) FROM configs').fetchone()[0]
def _remove_config_assert_cleared(store, cap_out):
@ -153,7 +155,8 @@ def test_invalid_manifest_gcd(tempdir_factory, store, in_git_dir, cap_out):
install_hooks(C.CONFIG_FILE, store)
# we'll "break" the manifest to simulate an old version clone
(_, _, path), = store.select_all_repos()
with store.connect() as db:
path, = db.execute('SELECT path FROM repos').fetchone()
os.remove(os.path.join(path, C.MANIFEST_FILE))
assert _config_count(store) == 1

View file

@ -22,6 +22,17 @@ from testing.util import git_commit
from testing.util import xfailif_windows
def _select_all_configs(store: Store) -> list[str]:
with store.connect() as db:
rows = db.execute('SELECT * FROM configs').fetchall()
return [path for path, in rows]
def _select_all_repos(store: Store) -> list[tuple[str, str, str]]:
with store.connect() as db:
return db.execute('SELECT repo, ref, path FROM repos').fetchall()
def test_our_session_fixture_works():
"""There's a session fixture which makes `Store` invariantly raise to
prevent writing to the home directory.
@ -91,7 +102,7 @@ def test_clone(store, tempdir_factory, caplog):
assert git.head_rev(ret) == rev
# Assert there's an entry in the sqlite db for this
assert store.select_all_repos() == [(path, rev, ret)]
assert _select_all_repos(store) == [(path, rev, ret)]
def test_warning_for_deprecated_stages_on_init(store, tempdir_factory, caplog):
@ -217,7 +228,7 @@ def test_clone_shallow_failure_fallback_to_complete(
assert git.head_rev(ret) == rev
# Assert there's an entry in the sqlite db for this
assert store.select_all_repos() == [(path, rev, ret)]
assert _select_all_repos(store) == [(path, rev, ret)]
def test_clone_tag_not_on_mainline(store, tempdir_factory):
@ -265,7 +276,7 @@ def test_mark_config_as_used(store, tmpdir):
with tmpdir.as_cwd():
f = tmpdir.join('f').ensure()
store.mark_config_used('f')
assert store.select_all_configs() == [f.strpath]
assert _select_all_configs(store) == [f.strpath]
def test_mark_config_as_used_idempotent(store, tmpdir):
@ -275,7 +286,7 @@ def test_mark_config_as_used_idempotent(store, tmpdir):
def test_mark_config_as_used_does_not_exist(store):
store.mark_config_used('f')
assert store.select_all_configs() == []
assert _select_all_configs(store) == []
def _simulate_pre_1_14_0(store):
@ -283,9 +294,9 @@ def _simulate_pre_1_14_0(store):
db.executescript('DROP TABLE configs')
def test_select_all_configs_roll_forward(store):
def test_gc_roll_forward(store):
_simulate_pre_1_14_0(store)
assert store.select_all_configs() == []
assert store.gc() == 0
def test_mark_config_as_used_roll_forward(store, tmpdir):
@ -314,7 +325,7 @@ def test_mark_config_as_used_readonly(tmpdir):
assert store.readonly
# should be skipped due to readonly
store.mark_config_used(str(cfg))
assert store.select_all_configs() == []
assert _select_all_configs(store) == []
def test_clone_with_recursive_submodules(store, tmp_path):