diff --git a/pre_commit/clientlib.py b/pre_commit/clientlib.py index f2582020..99ebfebe 100644 --- a/pre_commit/clientlib.py +++ b/pre_commit/clientlib.py @@ -315,6 +315,16 @@ load_manifest = functools.partial( ) +def load_manifest_contents(repo: str, contents: str) -> dict[str, Any]: + with ( + cfgv.reraise_as(InvalidManifestError), + cfgv.validate_context(f'File ({repo})/{C.MANIFEST_FILE}'), + ): + obj = _load_manifest_backward_compat(contents) + cfgv.validate(obj, MANIFEST_SCHEMA) + return cfgv.apply_defaults(obj, MANIFEST_SCHEMA) + + LOCAL = 'local' META = 'meta' @@ -577,3 +587,18 @@ load_config = functools.partial( load_strategy=yaml_load, exc_tp=InvalidConfigError, ) + + +class _AnySchema: + def check(self, v: object) -> None: + pass + + def apply_defaults(self, v: object) -> object: + return v + + +load_raw = functools.partial( + cfgv.load_from_filename, + schema=_AnySchema(), + load_strategy=yaml_load, +) diff --git a/pre_commit/commands/gc.py b/pre_commit/commands/gc.py index 7ebf4a1e..38f5635f 100644 --- a/pre_commit/commands/gc.py +++ b/pre_commit/commands/gc.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json import os.path from typing import Any @@ -15,18 +16,21 @@ from pre_commit.store import Store from pre_commit.util import rmtree -def _mark_used_repos( - store: Store, - all_repos: dict[tuple[str, str], str], - unused_repos: set[tuple[str, str]], +def _mark_used( + config: dict[str, Any], repo: dict[str, Any], + manifests: dict[tuple[str, str], dict[str, Any]], + unused_manifests: set[tuple[str, str]], + unused_installs: set[str], ) -> None: if repo['repo'] == META: return elif repo['repo'] == LOCAL: for hook in repo['hooks']: - deps = hook.get('additional_dependencies') - unused_repos.discard(( + deps = hook['additional_dependencies'] + unused_installs.discard(( + repo['repo'], C.LOCAL_REPO_VERSION, + repo['language'], store.db_repo_name(repo['repo'], deps), C.LOCAL_REPO_VERSION, )) @@ -60,11 +64,16 @@ def _mark_used_repos( def _gc(store: Store) -> int: with store.exclusive_lock(), store.connect() as db: - store._create_configs_table(db) + installs_rows = db.execute('SELECT key, path FROM installs').fetchall() + all_installs = dict(installs_rows) + unused_installs = set(all_installs) - repos = db.execute('SELECT repo, ref, path FROM repos').fetchall() - all_repos = {(repo, ref): path for repo, ref, path in repos} - unused_repos = set(all_repos) + manifests_query = 'SELECT repo, rev, manifest FROM manifests' + manifests = { + (repo, rev): json.loads(manifest) + for repo, rev, manifest in db.execute(manifests_query).fetchall() + } + unused_manifests = set(manifests) configs_rows = db.execute('SELECT path FROM configs').fetchall() configs = [path for path, in configs_rows] @@ -78,7 +87,13 @@ def _gc(store: Store) -> int: continue else: for repo in config['repos']: - _mark_used_repos(store, all_repos, unused_repos, repo) + _mark_used( + config, + repo, + manifests, + unused_manifests, + unused_installs, + ) paths = [(path,) for path in dead_configs] db.executemany('DELETE FROM configs WHERE path = ?', paths) @@ -94,5 +109,7 @@ def _gc(store: Store) -> int: def gc(store: Store) -> int: - output.write_line(f'{_gc(store)} repo(s) removed.') + installs, clones = _gc(store) + output.write_line(f'{clones} clone(s) removed.') + output.write_line(f'{installs} installs(s) removed.') return 0 diff --git a/pre_commit/hook.py b/pre_commit/hook.py index 309cd5be..976bb3d3 100644 --- a/pre_commit/hook.py +++ b/pre_commit/hook.py @@ -1,18 +1,28 @@ from __future__ import annotations +import json import logging from collections.abc import Sequence from typing import Any from typing import NamedTuple -from pre_commit.prefix import Prefix - logger = logging.getLogger('pre_commit') +class InstallKey(NamedTuple): + repo: str + rev: str + language: str + language_version: str + additional_dependencies: tuple[str, ...] + + def keystr(self) -> str: + return json.dumps(self, separators=(',', ':')) + + class Hook(NamedTuple): - src: str - prefix: Prefix + repo: str + rev: str id: str name: str entry: str @@ -37,24 +47,25 @@ class Hook(NamedTuple): verbose: bool @property - def install_key(self) -> tuple[Prefix, str, str, tuple[str, ...]]: - return ( - self.prefix, - self.language, - self.language_version, - tuple(self.additional_dependencies), + def install_key(self) -> InstallKey: + return InstallKey( + repo=self.repo, + rev=self.rev, + language=self.language, + language_version=self.language_version, + additional_dependencies=tuple(self.additional_dependencies), ) @classmethod - def create(cls, src: str, prefix: Prefix, dct: dict[str, Any]) -> Hook: + def create(cls, repo: str, rev: str, dct: dict[str, Any]) -> Hook: # TODO: have cfgv do this (?) extra_keys = set(dct) - _KEYS if extra_keys: logger.warning( - f'Unexpected key(s) present on {src} => {dct["id"]}: ' + f'Unexpected key(s) present on {repo} => {dct["id"]}: ' f'{", ".join(sorted(extra_keys))}', ) - return cls(src=src, prefix=prefix, **{k: dct[k] for k in _KEYS}) + return cls(repo=repo, rev=rev, **{k: dct[k] for k in _KEYS}) -_KEYS = frozenset(set(Hook._fields) - {'src', 'prefix'}) +_KEYS = frozenset(set(Hook._fields) - {'repo', 'rev'}) diff --git a/pre_commit/lang_base.py b/pre_commit/lang_base.py index 198e9365..b0f40faf 100644 --- a/pre_commit/lang_base.py +++ b/pre_commit/lang_base.py @@ -25,9 +25,6 @@ SHIMS_RE = re.compile(r'[/\\]shims[/\\]') class Language(Protocol): - # Use `None` for no installation / environment - @property - def ENVIRONMENT_DIR(self) -> str | None: ... # return a value to replace `'default` for `language_version` def get_default_version(self) -> str: ... # return whether the environment is healthy (or should be rebuilt) @@ -36,7 +33,8 @@ class Language(Protocol): # install a repository for the given language and language_version def install_environment( self, - prefix: Prefix, + clone: Prefix, + dest: Prefix, version: str, additional_dependencies: Sequence[str], ) -> None: @@ -87,10 +85,6 @@ def setup_cmd(prefix: Prefix, cmd: tuple[str, ...], **kwargs: Any) -> None: cmd_output_b(*cmd, cwd=prefix.prefix_dir, **kwargs) -def environment_dir(prefix: Prefix, d: str, language_version: str) -> str: - return prefix.path(f'{d}-{language_version}') - - def assert_version_default(binary: str, version: str) -> None: if version != C.DEFAULT: raise AssertionError( diff --git a/pre_commit/repository.py b/pre_commit/repository.py index fd58b05e..6edeedf0 100644 --- a/pre_commit/repository.py +++ b/pre_commit/repository.py @@ -1,91 +1,47 @@ from __future__ import annotations -import json import logging import os +import tempfile from collections.abc import Sequence from typing import Any import pre_commit.constants as C +from pre_commit import lang_base from pre_commit.all_languages import languages -from pre_commit.clientlib import load_manifest from pre_commit.clientlib import LOCAL from pre_commit.clientlib import META from pre_commit.hook import Hook -from pre_commit.lang_base import environment_dir +from pre_commit.hook import InstallKey from pre_commit.prefix import Prefix from pre_commit.store import Store from pre_commit.util import clean_path_on_failure -from pre_commit.util import rmtree logger = logging.getLogger('pre_commit') -def _state_filename_v1(venv: str) -> str: - return os.path.join(venv, '.install_state_v1') +def _state_filename_v5(d: str) -> str: + return os.path.join(d, '.pre-commit-state-v5') -def _state_filename_v2(venv: str) -> str: - return os.path.join(venv, '.install_state_v2') - - -def _state(additional_deps: Sequence[str]) -> object: - return {'additional_dependencies': additional_deps} - - -def _read_state(venv: str) -> object | None: - filename = _state_filename_v1(venv) - if not os.path.exists(filename): - return None - else: - with open(filename) as f: - return json.load(f) - - -def _hook_installed(hook: Hook) -> bool: - lang = languages[hook.language] - if lang.ENVIRONMENT_DIR is None: - return True - - venv = environment_dir( - hook.prefix, - lang.ENVIRONMENT_DIR, - hook.language_version, - ) - return ( - ( - os.path.exists(_state_filename_v2(venv)) or - _read_state(venv) == _state(hook.additional_dependencies) - ) and - not lang.health_check(hook.prefix, hook.language_version) - ) - - -def _hook_install(hook: Hook) -> None: - logger.info(f'Installing environment for {hook.src}.') +def _hook_install(hook: Hook, store: Store) -> None: + logger.info(f'Installing environment for {hook.repo}.') logger.info('Once installed this environment will be reused.') logger.info('This may take a few minutes...') lang = languages[hook.language] - assert lang.ENVIRONMENT_DIR is not None - venv = environment_dir( - hook.prefix, - lang.ENVIRONMENT_DIR, - hook.language_version, - ) + clone = store.clone(hook.repo, hook.rev) + dest = tempfile.mkdtemp(prefix='i-', dir=store.directory) - # There's potentially incomplete cleanup from previous runs - # Clean it up! - if os.path.exists(venv): - rmtree(venv) - - with clean_path_on_failure(venv): + with clean_path_on_failure(dest): + prefix = Prefix(dest) lang.install_environment( - hook.prefix, hook.language_version, hook.additional_dependencies, + Prefix(clone), prefix, + hook.language_version, hook.additional_dependencies, ) - health_error = lang.health_check(hook.prefix, hook.language_version) + health_error = lang.health_check(prefix, hook.language_version) if health_error: raise AssertionError( f'BUG: expected environment for {hook.language} to be healthy ' @@ -94,16 +50,12 @@ def _hook_install(hook: Hook) -> None: f'more info:\n\n{health_error}', ) - # TODO: remove v1 state writing, no longer needed after pre-commit 3.0 - # Write our state to indicate we're installed - state_filename = _state_filename_v1(venv) - staging = f'{state_filename}staging' - with open(staging, 'w') as state_file: - state_file.write(json.dumps(_state(hook.additional_dependencies))) - # Move the file into place atomically to indicate we've installed - os.replace(staging, state_filename) + # TODO: need more info? + open(_state_filename_v5(dest), 'a+').close() - open(_state_filename_v2(venv), 'a+').close() + +def _hook_installed(hook: Hook, store: Store) -> bool: + return False def _hook( @@ -123,7 +75,7 @@ def _hook( if not ret['stages']: ret['stages'] = root_config['default_stages'] - if languages[lang].ENVIRONMENT_DIR is None: + if languages[lang].install_environment is lang_base.no_install: if ret['language_version'] != C.DEFAULT: logger.error( f'The hook `{ret["id"]}` specifies `language_version` but is ' @@ -131,7 +83,7 @@ def _hook( f'environment. ' f'Perhaps you meant to use a specific language?', ) - exit(1) + raise SystemExit(1) if ret['additional_dependencies']: logger.error( f'The hook `{ret["id"]}` specifies `additional_dependencies` ' @@ -139,86 +91,64 @@ def _hook( f'environment. ' f'Perhaps you meant to use a specific language?', ) - exit(1) + raise SystemExit(1) return ret -def _non_cloned_repository_hooks( - repo_config: dict[str, Any], - store: Store, - root_config: dict[str, Any], -) -> tuple[Hook, ...]: - def _prefix(language_name: str, deps: Sequence[str]) -> Prefix: - language = languages[language_name] - # pygrep / script / system / docker_image do not have - # environments so they work out of the current directory - if language.ENVIRONMENT_DIR is None: - return Prefix(os.getcwd()) - else: - return Prefix(store.make_local(deps)) - - return tuple( - Hook.create( - repo_config['repo'], - _prefix(hook['language'], hook['additional_dependencies']), - _hook(hook, root_config=root_config), - ) - for hook in repo_config['hooks'] - ) - - -def _cloned_repository_hooks( - repo_config: dict[str, Any], - store: Store, - root_config: dict[str, Any], -) -> tuple[Hook, ...]: - repo, rev = repo_config['repo'], repo_config['rev'] - manifest_path = os.path.join(store.clone(repo, rev), C.MANIFEST_FILE) - by_id = load_manifest(manifest_path)['hooks'] - - for hook in repo_config['hooks']: - if hook['id'] not in by_id: - logger.error( - f'`{hook["id"]}` is not present in repository {repo}. ' - f'Typo? Perhaps it is introduced in a newer version? ' - f'Often `pre-commit autoupdate` fixes this.', - ) - exit(1) - - hook_dcts = [ - _hook(by_id[hook['id']], hook, root_config=root_config) - for hook in repo_config['hooks'] - ] - return tuple( - Hook.create( - repo_config['repo'], - Prefix(store.clone(repo, rev, hook['additional_dependencies'])), - hook, - ) - for hook in hook_dcts - ) - - def _repository_hooks( repo_config: dict[str, Any], store: Store, root_config: dict[str, Any], ) -> tuple[Hook, ...]: - if repo_config['repo'] in {LOCAL, META}: - return _non_cloned_repository_hooks(repo_config, store, root_config) + repo = repo_config['repo'] + if repo == META: + return tuple( + Hook.create( + repo, '', + _hook(hook, root_config=root_config), + ) + for hook in repo_config['hooks'] + ) + elif repo == LOCAL: + return tuple( + Hook.create( + repo, C.LOCAL_REPO_VERSION, + _hook(hook, root_config=root_config), + ) + for hook in repo_config['hooks'] + ) else: - return _cloned_repository_hooks(repo_config, store, root_config) + rev = repo_config['rev'] + by_id = store.manifest(repo, rev)['hooks'] + + for hook in repo_config['hooks']: + if hook['id'] not in by_id: + logger.error( + f'`{hook["id"]}` is not present in repository {repo}. ' + f'Typo? Perhaps it is introduced in a newer version? ' + f'Often `pre-commit autoupdate` fixes this.', + ) + raise SystemExit(1) + + return tuple( + Hook.create( + repo, rev, + _hook(by_id[hook['id']], hook, root_config=root_config), + ) + for hook in repo_config['hooks'] + ) def install_hook_envs(hooks: Sequence[Hook], store: Store) -> None: def _need_installed() -> list[Hook]: - seen: set[tuple[Prefix, str, str, tuple[str, ...]]] = set() + seen: set[InstallKey] = set() ret = [] for hook in hooks: - if hook.install_key not in seen and not _hook_installed(hook): + key = hook.install_key + if key not in seen and not _hook_installed(hook, store): ret.append(hook) - seen.add(hook.install_key) + seen.add(key) return ret if not _need_installed(): @@ -226,7 +156,7 @@ def install_hook_envs(hooks: Sequence[Hook], store: Store) -> None: with store.exclusive_lock(): # Another process may have already completed this work for hook in _need_installed(): - _hook_install(hook) + _hook_install(hook, store) def all_hooks(root_config: dict[str, Any], store: Store) -> tuple[Hook, ...]: diff --git a/pre_commit/store.py b/pre_commit/store.py index dc90c051..a6d80d04 100644 --- a/pre_commit/store.py +++ b/pre_commit/store.py @@ -1,13 +1,14 @@ from __future__ import annotations import contextlib +import json import logging import os.path import sqlite3 import tempfile from collections.abc import Callable from collections.abc import Generator -from collections.abc import Sequence +from typing import Any import pre_commit.constants as C from pre_commit import clientlib @@ -61,7 +62,7 @@ class Store: def __init__(self, directory: str | None = None) -> None: self.directory = directory or Store.get_default_directory() - self.db_path = os.path.join(self.directory, 'db.db') + self.db_path = os.path.join(self.directory, 'db5.db') self.readonly = ( os.path.exists(self.directory) and not os.access(self.directory, os.W_OK) @@ -82,20 +83,40 @@ class Store: if os.path.exists(self.db_path): # pragma: no cover (race) return # To avoid a race where someone ^Cs between db creation and - # execution of the CREATE TABLE statement + # execution of the CREATE TABLE statements fd, tmpfile = tempfile.mkstemp(dir=self.directory) # We'll be managing this file ourselves os.close(fd) with self.connect(db_path=tmpfile) as db: db.executescript( - 'CREATE TABLE repos (' - ' repo TEXT NOT NULL,' - ' ref TEXT NOT NULL,' - ' path TEXT NOT NULL,' - ' PRIMARY KEY (repo, ref)' + 'CREATE TABLE configs (' + ' path TEXT NOT NULL,' + ' PRIMARY KEY (path)' + ');', + ) + db.executescript( + 'CREATE TABLE manifests (' + ' repo TEXT NOT NULL,' + ' rev TEXT NOT NULL,' + ' manifest TEXT NOT NULL,' + ' PRIMARY KEY (repo, rev)' + ');', + ) + db.executescript( + 'CREATE TABLE clones (' + ' repo TEXT NOT NULL,' + ' rev TEXT NOT NULL,' + ' path TEXT NOT NULL,' + ' PRIMARY KEY (repo, rev)' + ');', + ) + db.executescript( + 'CREATE TABLE installs (' + ' key TEXT NOT NULL,' + ' path TEXT NOT NULL,' + ' PRIMARY KEY (key)' ');', ) - self._create_configs_table(db) # Atomic file move os.replace(tmpfile, self.db_path) @@ -122,29 +143,30 @@ class Store: with db: yield db - @classmethod - def db_repo_name(cls, repo: str, deps: Sequence[str]) -> str: - if deps: - return f'{repo}:{",".join(deps)}' - else: - return repo + def _complete_clone(self, rev: str, git_cmd: Callable[..., None]) -> None: + """Perform a complete clone of a repository and its submodules """ - def _new_repo( - self, - repo: str, - ref: str, - deps: Sequence[str], - make_strategy: Callable[[str], None], - ) -> str: - original_repo = repo - repo = self.db_repo_name(repo, deps) + git_cmd('fetch', 'origin', '--tags') + git_cmd('checkout', rev) + git_cmd('submodule', 'update', '--init', '--recursive') + def _shallow_clone(self, rev: str, git_cmd: Callable[..., None]) -> None: + """Perform a shallow clone of a repository and its submodules """ + + v2 = ('-c', 'protocol.version=2') + git_cmd(*v2, 'fetch', 'origin', rev, '--depth=1') + git_cmd('checkout', 'FETCH_HEAD') + git_cmd( + *v2, 'submodule', 'update', '--init', '--recursive', + '--depth=1', + ) + + def clone(self, repo: str, rev: str) -> str: def _get_result() -> str | None: - # Check if we already exist with self.connect() as db: result = db.execute( - 'SELECT path FROM repos WHERE repo = ? AND ref = ?', - (repo, ref), + 'SELECT path FROM clones WHERE repo = ? AND rev = ?', + (repo, rev), ).fetchone() return result[0] if result else None @@ -157,70 +179,64 @@ class Store: if result: # pragma: no cover (race) return result - logger.info(f'Initializing environment for {repo}.') + logger.info(f'Cloning {repo}...') - directory = tempfile.mkdtemp(prefix='repo', dir=self.directory) + directory = tempfile.mkdtemp(prefix='clone', dir=self.directory) with clean_path_on_failure(directory): - make_strategy(directory) + git.init_repo(directory, repo) + env = git.no_git_env() + + def _git_cmd(*args: str) -> None: + cmd_output_b('git', *args, cwd=directory, env=env) + + try: + self._shallow_clone(rev, _git_cmd) + except CalledProcessError: + self._complete_clone(rev, _git_cmd) + + manifest = clientlib.load_raw( + os.path.join(directory, C.MANIFEST_FILE), + display_filename=f'({repo})/{C.MANIFEST_FILE}', + exc_tp=clientlib.InvalidManifestError, + ) # Update our db with the created repo with self.connect() as db: db.execute( - 'INSERT INTO repos (repo, ref, path) VALUES (?, ?, ?)', - [repo, ref, directory], + 'INSERT INTO clones VALUES (?, ?, ?)', + (repo, rev, directory), + ) + db.execute( + 'INSERT INTO manifests VALUES (?, ?, ?)', + (repo, rev, json.dumps(manifest)), ) - clientlib.warn_for_stages_on_repo_init(original_repo, directory) + clientlib.warn_for_stages_on_repo_init(repo, directory) return directory - def _complete_clone(self, ref: str, git_cmd: Callable[..., None]) -> None: - """Perform a complete clone of a repository and its submodules """ + def maybe_manifest( + self, + repo: str, + rev: str, + ) -> dict[str, dict[str, Any]] | None: + with self.connect() as db: + result = db.execute( + 'SELECT manifest FROM manifests WHERE repo = ? AND rev = ?', + (repo, rev), + ).fetchone() + if result is not None: + return clientlib.load_manifest_contents(repo, result[0]) + else: + return None - git_cmd('fetch', 'origin', '--tags') - git_cmd('checkout', ref) - git_cmd('submodule', 'update', '--init', '--recursive') - - def _shallow_clone(self, ref: str, git_cmd: Callable[..., None]) -> None: - """Perform a shallow clone of a repository and its submodules """ - - git_config = 'protocol.version=2' - git_cmd('-c', git_config, 'fetch', 'origin', ref, '--depth=1') - git_cmd('checkout', 'FETCH_HEAD') - git_cmd( - '-c', git_config, 'submodule', 'update', '--init', '--recursive', - '--depth=1', - ) - - def clone(self, repo: str, ref: str, deps: Sequence[str] = ()) -> str: - """Clone the given url and checkout the specific ref.""" - - def clone_strategy(directory: str) -> None: - git.init_repo(directory, repo) - env = git.no_git_env() - - def _git_cmd(*args: str) -> None: - cmd_output_b('git', *args, cwd=directory, env=env) - - try: - self._shallow_clone(ref, _git_cmd) - except CalledProcessError: - self._complete_clone(ref, _git_cmd) - - return self._new_repo(repo, ref, deps, clone_strategy) - - def make_local(self, deps: Sequence[str]) -> str: - return self._new_repo( - 'local', C.LOCAL_REPO_VERSION, deps, _make_local_repo, - ) - - def _create_configs_table(self, db: sqlite3.Connection) -> None: - db.executescript( - 'CREATE TABLE IF NOT EXISTS configs (' - ' path TEXT NOT NULL,' - ' PRIMARY KEY (path)' - ');', - ) + def manifest(self, repo: str, rev: str) -> dict[str, dict[str, Any]]: + ret = self.maybe_manifest(repo, rev) + if ret is None: + self.clone(repo, rev) + ret = self.maybe_manifest(repo, rev) + assert ret is not None + return ret def mark_config_used(self, path: str) -> None: if self.readonly: # pragma: win32 no cover @@ -230,6 +246,4 @@ class Store: if not os.path.exists(path): return with self.connect() as db: - # TODO: eventually remove this and only create in _create - self._create_configs_table(db) db.execute('INSERT OR IGNORE INTO configs VALUES (?)', (path,))