Merge pull request #3578 from pre-commit/store-gc-refactor
Some checks are pending
languages / vars (push) Waiting to run
languages / language (push) Blocked by required conditions
languages / collector (push) Blocked by required conditions
main / main-windows (push) Waiting to run
main / main-linux (push) Waiting to run

refactor gc into store
This commit is contained in:
anthony sottile 2025-11-09 17:16:27 -05:00 committed by GitHub
commit 49e28eea48
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 101 additions and 111 deletions

View file

@ -1,89 +1,9 @@
from __future__ import annotations from __future__ import annotations
import os.path
from typing import Any
import pre_commit.constants as C
from pre_commit import output from pre_commit import output
from pre_commit.clientlib import InvalidConfigError
from pre_commit.clientlib import InvalidManifestError
from pre_commit.clientlib import load_config
from pre_commit.clientlib import load_manifest
from pre_commit.clientlib import LOCAL
from pre_commit.clientlib import META
from pre_commit.store import Store from pre_commit.store import Store
def _mark_used_repos(
store: Store,
all_repos: dict[tuple[str, str], str],
unused_repos: set[tuple[str, str]],
repo: dict[str, Any],
) -> None:
if repo['repo'] == META:
return
elif repo['repo'] == LOCAL:
for hook in repo['hooks']:
deps = hook.get('additional_dependencies')
unused_repos.discard((
store.db_repo_name(repo['repo'], deps), C.LOCAL_REPO_VERSION,
))
else:
key = (repo['repo'], repo['rev'])
path = all_repos.get(key)
# can't inspect manifest if it isn't cloned
if path is None:
return
try:
manifest = load_manifest(os.path.join(path, C.MANIFEST_FILE))
except InvalidManifestError:
return
else:
unused_repos.discard(key)
by_id = {hook['id']: hook for hook in manifest}
for hook in repo['hooks']:
if hook['id'] not in by_id:
continue
deps = hook.get(
'additional_dependencies',
by_id[hook['id']]['additional_dependencies'],
)
unused_repos.discard((
store.db_repo_name(repo['repo'], deps), repo['rev'],
))
def _gc_repos(store: Store) -> int:
configs = store.select_all_configs()
repos = store.select_all_repos()
# delete config paths which do not exist
dead_configs = [p for p in configs if not os.path.exists(p)]
live_configs = [p for p in configs if os.path.exists(p)]
all_repos = {(repo, ref): path for repo, ref, path in repos}
unused_repos = set(all_repos)
for config_path in live_configs:
try:
config = load_config(config_path)
except InvalidConfigError:
dead_configs.append(config_path)
continue
else:
for repo in config['repos']:
_mark_used_repos(store, all_repos, unused_repos, repo)
store.delete_configs(dead_configs)
for db_repo_name, ref in unused_repos:
store.delete_repo(db_repo_name, ref, all_repos[(db_repo_name, ref)])
return len(unused_repos)
def gc(store: Store) -> int: def gc(store: Store) -> int:
with store.exclusive_lock(): output.write_line(f'{store.gc()} repo(s) removed.')
repos_removed = _gc_repos(store)
output.write_line(f'{repos_removed} repo(s) removed.')
return 0 return 0

View file

@ -8,6 +8,7 @@ import tempfile
from collections.abc import Callable from collections.abc import Callable
from collections.abc import Generator from collections.abc import Generator
from collections.abc import Sequence from collections.abc import Sequence
from typing import Any
import pre_commit.constants as C import pre_commit.constants as C
from pre_commit import clientlib from pre_commit import clientlib
@ -96,7 +97,7 @@ class Store:
' PRIMARY KEY (repo, ref)' ' PRIMARY KEY (repo, ref)'
');', ');',
) )
self._create_config_table(db) self._create_configs_table(db)
# Atomic file move # Atomic file move
os.replace(tmpfile, self.db_path) os.replace(tmpfile, self.db_path)
@ -215,7 +216,7 @@ class Store:
'local', C.LOCAL_REPO_VERSION, deps, _make_local_repo, 'local', C.LOCAL_REPO_VERSION, deps, _make_local_repo,
) )
def _create_config_table(self, db: sqlite3.Connection) -> None: def _create_configs_table(self, db: sqlite3.Connection) -> None:
db.executescript( db.executescript(
'CREATE TABLE IF NOT EXISTS configs (' 'CREATE TABLE IF NOT EXISTS configs ('
' path TEXT NOT NULL,' ' path TEXT NOT NULL,'
@ -232,28 +233,83 @@ class Store:
return return
with self.connect() as db: with self.connect() as db:
# TODO: eventually remove this and only create in _create # TODO: eventually remove this and only create in _create
self._create_config_table(db) self._create_configs_table(db)
db.execute('INSERT OR IGNORE INTO configs VALUES (?)', (path,)) db.execute('INSERT OR IGNORE INTO configs VALUES (?)', (path,))
def select_all_configs(self) -> list[str]: def _mark_used_repos(
with self.connect() as db: self,
self._create_config_table(db) all_repos: dict[tuple[str, str], str],
rows = db.execute('SELECT path FROM configs').fetchall() unused_repos: set[tuple[str, str]],
return [path for path, in rows] repo: dict[str, Any],
) -> None:
if repo['repo'] == clientlib.META:
return
elif repo['repo'] == clientlib.LOCAL:
for hook in repo['hooks']:
deps = hook.get('additional_dependencies')
unused_repos.discard((
self.db_repo_name(repo['repo'], deps),
C.LOCAL_REPO_VERSION,
))
else:
key = (repo['repo'], repo['rev'])
path = all_repos.get(key)
# can't inspect manifest if it isn't cloned
if path is None:
return
def delete_configs(self, configs: list[str]) -> None: try:
with self.connect() as db: manifest = clientlib.load_manifest(
rows = [(path,) for path in configs] os.path.join(path, C.MANIFEST_FILE),
db.executemany('DELETE FROM configs WHERE path = ?', rows) )
except clientlib.InvalidManifestError:
return
else:
unused_repos.discard(key)
by_id = {hook['id']: hook for hook in manifest}
def select_all_repos(self) -> list[tuple[str, str, str]]: for hook in repo['hooks']:
with self.connect() as db: if hook['id'] not in by_id:
return db.execute('SELECT repo, ref, path from repos').fetchall() continue
def delete_repo(self, db_repo_name: str, ref: str, path: str) -> None: deps = hook.get(
with self.connect() as db: 'additional_dependencies',
db.execute( by_id[hook['id']]['additional_dependencies'],
)
unused_repos.discard((
self.db_repo_name(repo['repo'], deps), repo['rev'],
))
def gc(self) -> int:
with self.exclusive_lock(), self.connect() as db:
self._create_configs_table(db)
repos = db.execute('SELECT repo, ref, path FROM repos').fetchall()
all_repos = {(repo, ref): path for repo, ref, path in repos}
unused_repos = set(all_repos)
configs_rows = db.execute('SELECT path FROM configs').fetchall()
configs = [path for path, in configs_rows]
dead_configs = []
for config_path in configs:
try:
config = clientlib.load_config(config_path)
except clientlib.InvalidConfigError:
dead_configs.append(config_path)
continue
else:
for repo in config['repos']:
self._mark_used_repos(all_repos, unused_repos, repo)
paths = [(path,) for path in dead_configs]
db.executemany('DELETE FROM configs WHERE path = ?', paths)
db.executemany(
'DELETE FROM repos WHERE repo = ? and ref = ?', 'DELETE FROM repos WHERE repo = ? and ref = ?',
(db_repo_name, ref), sorted(unused_repos),
) )
rmtree(path) for k in unused_repos:
rmtree(all_repos[k])
return len(unused_repos)

View file

@ -19,11 +19,13 @@ from testing.util import git_commit
def _repo_count(store): def _repo_count(store):
return len(store.select_all_repos()) with store.connect() as db:
return db.execute('SELECT COUNT(1) FROM repos').fetchone()[0]
def _config_count(store): def _config_count(store):
return len(store.select_all_configs()) with store.connect() as db:
return db.execute('SELECT COUNT(1) FROM configs').fetchone()[0]
def _remove_config_assert_cleared(store, cap_out): def _remove_config_assert_cleared(store, cap_out):
@ -153,7 +155,8 @@ def test_invalid_manifest_gcd(tempdir_factory, store, in_git_dir, cap_out):
install_hooks(C.CONFIG_FILE, store) install_hooks(C.CONFIG_FILE, store)
# we'll "break" the manifest to simulate an old version clone # we'll "break" the manifest to simulate an old version clone
(_, _, path), = store.select_all_repos() with store.connect() as db:
path, = db.execute('SELECT path FROM repos').fetchone()
os.remove(os.path.join(path, C.MANIFEST_FILE)) os.remove(os.path.join(path, C.MANIFEST_FILE))
assert _config_count(store) == 1 assert _config_count(store) == 1

View file

@ -22,6 +22,17 @@ from testing.util import git_commit
from testing.util import xfailif_windows from testing.util import xfailif_windows
def _select_all_configs(store: Store) -> list[str]:
with store.connect() as db:
rows = db.execute('SELECT * FROM configs').fetchall()
return [path for path, in rows]
def _select_all_repos(store: Store) -> list[tuple[str, str, str]]:
with store.connect() as db:
return db.execute('SELECT repo, ref, path FROM repos').fetchall()
def test_our_session_fixture_works(): def test_our_session_fixture_works():
"""There's a session fixture which makes `Store` invariantly raise to """There's a session fixture which makes `Store` invariantly raise to
prevent writing to the home directory. prevent writing to the home directory.
@ -91,7 +102,7 @@ def test_clone(store, tempdir_factory, caplog):
assert git.head_rev(ret) == rev assert git.head_rev(ret) == rev
# Assert there's an entry in the sqlite db for this # Assert there's an entry in the sqlite db for this
assert store.select_all_repos() == [(path, rev, ret)] assert _select_all_repos(store) == [(path, rev, ret)]
def test_warning_for_deprecated_stages_on_init(store, tempdir_factory, caplog): def test_warning_for_deprecated_stages_on_init(store, tempdir_factory, caplog):
@ -217,7 +228,7 @@ def test_clone_shallow_failure_fallback_to_complete(
assert git.head_rev(ret) == rev assert git.head_rev(ret) == rev
# Assert there's an entry in the sqlite db for this # Assert there's an entry in the sqlite db for this
assert store.select_all_repos() == [(path, rev, ret)] assert _select_all_repos(store) == [(path, rev, ret)]
def test_clone_tag_not_on_mainline(store, tempdir_factory): def test_clone_tag_not_on_mainline(store, tempdir_factory):
@ -265,7 +276,7 @@ def test_mark_config_as_used(store, tmpdir):
with tmpdir.as_cwd(): with tmpdir.as_cwd():
f = tmpdir.join('f').ensure() f = tmpdir.join('f').ensure()
store.mark_config_used('f') store.mark_config_used('f')
assert store.select_all_configs() == [f.strpath] assert _select_all_configs(store) == [f.strpath]
def test_mark_config_as_used_idempotent(store, tmpdir): def test_mark_config_as_used_idempotent(store, tmpdir):
@ -275,7 +286,7 @@ def test_mark_config_as_used_idempotent(store, tmpdir):
def test_mark_config_as_used_does_not_exist(store): def test_mark_config_as_used_does_not_exist(store):
store.mark_config_used('f') store.mark_config_used('f')
assert store.select_all_configs() == [] assert _select_all_configs(store) == []
def _simulate_pre_1_14_0(store): def _simulate_pre_1_14_0(store):
@ -283,9 +294,9 @@ def _simulate_pre_1_14_0(store):
db.executescript('DROP TABLE configs') db.executescript('DROP TABLE configs')
def test_select_all_configs_roll_forward(store): def test_gc_roll_forward(store):
_simulate_pre_1_14_0(store) _simulate_pre_1_14_0(store)
assert store.select_all_configs() == [] assert store.gc() == 0
def test_mark_config_as_used_roll_forward(store, tmpdir): def test_mark_config_as_used_roll_forward(store, tmpdir):
@ -314,7 +325,7 @@ def test_mark_config_as_used_readonly(tmpdir):
assert store.readonly assert store.readonly
# should be skipped due to readonly # should be skipped due to readonly
store.mark_config_used(str(cfg)) store.mark_config_used(str(cfg))
assert store.select_all_configs() == [] assert _select_all_configs(store) == []
def test_clone_with_recursive_submodules(store, tmp_path): def test_clone_with_recursive_submodules(store, tmp_path):