mirror of
https://github.com/pre-commit/pre-commit.git
synced 2026-02-17 08:14:42 +04:00
Speed up filename filtering.
Before, there was a `getcwd` syscall for every filename that was filtered. Instead, the file-type lookup is now cached per run.
- When all files are identified by filename only: ~45% improvement
- When no files are identified by filename only: ~55% improvement
This makes little difference to overall execution time; the bigger win is eliminating the `memoize_by_cwd` hack. Just removing the memoization would have *increased* the runtime by 300-500%.
This commit is contained in:
parent
38308dc02d
commit
b1389603e0
6 changed files with 61 additions and 103 deletions
|
|
@ -17,14 +17,47 @@ from pre_commit.repository import all_hooks
|
||||||
from pre_commit.repository import install_hook_envs
|
from pre_commit.repository import install_hook_envs
|
||||||
from pre_commit.staged_files_only import staged_files_only
|
from pre_commit.staged_files_only import staged_files_only
|
||||||
from pre_commit.util import cmd_output
|
from pre_commit.util import cmd_output
|
||||||
from pre_commit.util import memoize_by_cwd
|
|
||||||
from pre_commit.util import noop_context
|
from pre_commit.util import noop_context
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger('pre_commit')
|
logger = logging.getLogger('pre_commit')
|
||||||
|
|
||||||
|
|
||||||
tags_from_path = memoize_by_cwd(tags_from_path)
|
def filter_by_include_exclude(names, include, exclude):
|
||||||
|
include_re, exclude_re = re.compile(include), re.compile(exclude)
|
||||||
|
return [
|
||||||
|
filename for filename in names
|
||||||
|
if include_re.search(filename)
|
||||||
|
if not exclude_re.search(filename)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class Classifier(object):
|
||||||
|
def __init__(self, filenames):
|
||||||
|
self.filenames = [f for f in filenames if os.path.lexists(f)]
|
||||||
|
self._types_cache = {}
|
||||||
|
|
||||||
|
def _types_for_file(self, filename):
|
||||||
|
try:
|
||||||
|
return self._types_cache[filename]
|
||||||
|
except KeyError:
|
||||||
|
ret = self._types_cache[filename] = tags_from_path(filename)
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def by_types(self, names, types, exclude_types):
|
||||||
|
types, exclude_types = frozenset(types), frozenset(exclude_types)
|
||||||
|
ret = []
|
||||||
|
for filename in names:
|
||||||
|
tags = self._types_for_file(filename)
|
||||||
|
if tags >= types and not tags & exclude_types:
|
||||||
|
ret.append(filename)
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def filenames_for_hook(self, hook):
|
||||||
|
names = self.filenames
|
||||||
|
names = filter_by_include_exclude(names, hook.files, hook.exclude)
|
||||||
|
names = self.by_types(names, hook.types, hook.exclude_types)
|
||||||
|
return names
|
||||||
|
|
||||||
|
|
||||||
def _get_skips(environ):
|
def _get_skips(environ):
|
||||||
|
|
@ -36,37 +69,12 @@ def _hook_msg_start(hook, verbose):
|
||||||
return '{}{}'.format('[{}] '.format(hook.id) if verbose else '', hook.name)
|
return '{}{}'.format('[{}] '.format(hook.id) if verbose else '', hook.name)
|
||||||
|
|
||||||
|
|
||||||
def _filter_by_include_exclude(filenames, include, exclude):
|
|
||||||
include_re, exclude_re = re.compile(include), re.compile(exclude)
|
|
||||||
return [
|
|
||||||
filename for filename in filenames
|
|
||||||
if (
|
|
||||||
include_re.search(filename) and
|
|
||||||
not exclude_re.search(filename) and
|
|
||||||
os.path.lexists(filename)
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def _filter_by_types(filenames, types, exclude_types):
|
|
||||||
types, exclude_types = frozenset(types), frozenset(exclude_types)
|
|
||||||
ret = []
|
|
||||||
for filename in filenames:
|
|
||||||
tags = tags_from_path(filename)
|
|
||||||
if tags >= types and not tags & exclude_types:
|
|
||||||
ret.append(filename)
|
|
||||||
return tuple(ret)
|
|
||||||
|
|
||||||
|
|
||||||
SKIPPED = 'Skipped'
|
SKIPPED = 'Skipped'
|
||||||
NO_FILES = '(no files to check)'
|
NO_FILES = '(no files to check)'
|
||||||
|
|
||||||
|
|
||||||
def _run_single_hook(filenames, hook, args, skips, cols):
|
def _run_single_hook(classifier, hook, args, skips, cols):
|
||||||
include, exclude = hook.files, hook.exclude
|
filenames = classifier.filenames_for_hook(hook)
|
||||||
filenames = _filter_by_include_exclude(filenames, include, exclude)
|
|
||||||
types, exclude_types = hook.types, hook.exclude_types
|
|
||||||
filenames = _filter_by_types(filenames, types, exclude_types)
|
|
||||||
|
|
||||||
if hook.language == 'pcre':
|
if hook.language == 'pcre':
|
||||||
logger.warning(
|
logger.warning(
|
||||||
|
|
@ -193,10 +201,11 @@ def _run_hooks(config, hooks, args, environ):
|
||||||
skips = _get_skips(environ)
|
skips = _get_skips(environ)
|
||||||
cols = _compute_cols(hooks, args.verbose)
|
cols = _compute_cols(hooks, args.verbose)
|
||||||
filenames = _all_filenames(args)
|
filenames = _all_filenames(args)
|
||||||
filenames = _filter_by_include_exclude(filenames, '', config['exclude'])
|
filenames = filter_by_include_exclude(filenames, '', config['exclude'])
|
||||||
|
classifier = Classifier(filenames)
|
||||||
retval = 0
|
retval = 0
|
||||||
for hook in hooks:
|
for hook in hooks:
|
||||||
retval |= _run_single_hook(filenames, hook, args, skips, cols)
|
retval |= _run_single_hook(classifier, hook, args, skips, cols)
|
||||||
if retval and config['fail_fast']:
|
if retval and config['fail_fast']:
|
||||||
break
|
break
|
||||||
if retval and args.show_diff_on_failure and git.has_diff():
|
if retval and args.show_diff_on_failure and git.has_diff():
|
||||||
|
|
|
||||||
|
|
@ -3,24 +3,19 @@ import argparse
|
||||||
import pre_commit.constants as C
|
import pre_commit.constants as C
|
||||||
from pre_commit import git
|
from pre_commit import git
|
||||||
from pre_commit.clientlib import load_config
|
from pre_commit.clientlib import load_config
|
||||||
from pre_commit.commands.run import _filter_by_include_exclude
|
from pre_commit.commands.run import Classifier
|
||||||
from pre_commit.commands.run import _filter_by_types
|
|
||||||
from pre_commit.repository import all_hooks
|
from pre_commit.repository import all_hooks
|
||||||
from pre_commit.store import Store
|
from pre_commit.store import Store
|
||||||
|
|
||||||
|
|
||||||
def check_all_hooks_match_files(config_file):
|
def check_all_hooks_match_files(config_file):
|
||||||
files = git.get_all_files()
|
classifier = Classifier(git.get_all_files())
|
||||||
retv = 0
|
retv = 0
|
||||||
|
|
||||||
for hook in all_hooks(load_config(config_file), Store()):
|
for hook in all_hooks(load_config(config_file), Store()):
|
||||||
if hook.always_run or hook.language == 'fail':
|
if hook.always_run or hook.language == 'fail':
|
||||||
continue
|
continue
|
||||||
include, exclude = hook.files, hook.exclude
|
elif not classifier.filenames_for_hook(hook):
|
||||||
filtered = _filter_by_include_exclude(files, include, exclude)
|
|
||||||
types, exclude_types = hook.types, hook.exclude_types
|
|
||||||
filtered = _filter_by_types(filtered, types, exclude_types)
|
|
||||||
if not filtered:
|
|
||||||
print('{} does not apply to this repository'.format(hook.id))
|
print('{} does not apply to this repository'.format(hook.id))
|
||||||
retv = 1
|
retv = 1
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ import pre_commit.constants as C
|
||||||
from pre_commit import git
|
from pre_commit import git
|
||||||
from pre_commit.clientlib import load_config
|
from pre_commit.clientlib import load_config
|
||||||
from pre_commit.clientlib import MANIFEST_HOOK_DICT
|
from pre_commit.clientlib import MANIFEST_HOOK_DICT
|
||||||
from pre_commit.commands.run import _filter_by_types
|
from pre_commit.commands.run import Classifier
|
||||||
|
|
||||||
|
|
||||||
def exclude_matches_any(filenames, include, exclude):
|
def exclude_matches_any(filenames, include, exclude):
|
||||||
|
|
@ -24,11 +24,11 @@ def exclude_matches_any(filenames, include, exclude):
|
||||||
|
|
||||||
def check_useless_excludes(config_file):
|
def check_useless_excludes(config_file):
|
||||||
config = load_config(config_file)
|
config = load_config(config_file)
|
||||||
files = git.get_all_files()
|
classifier = Classifier(git.get_all_files())
|
||||||
retv = 0
|
retv = 0
|
||||||
|
|
||||||
exclude = config['exclude']
|
exclude = config['exclude']
|
||||||
if not exclude_matches_any(files, '', exclude):
|
if not exclude_matches_any(classifier.filenames, '', exclude):
|
||||||
print(
|
print(
|
||||||
'The global exclude pattern {!r} does not match any files'
|
'The global exclude pattern {!r} does not match any files'
|
||||||
.format(exclude),
|
.format(exclude),
|
||||||
|
|
@ -40,10 +40,11 @@ def check_useless_excludes(config_file):
|
||||||
# Not actually a manifest dict, but this more accurately reflects
|
# Not actually a manifest dict, but this more accurately reflects
|
||||||
# the defaults applied during runtime
|
# the defaults applied during runtime
|
||||||
hook = apply_defaults(hook, MANIFEST_HOOK_DICT)
|
hook = apply_defaults(hook, MANIFEST_HOOK_DICT)
|
||||||
|
names = classifier.filenames
|
||||||
types, exclude_types = hook['types'], hook['exclude_types']
|
types, exclude_types = hook['types'], hook['exclude_types']
|
||||||
filtered_by_types = _filter_by_types(files, types, exclude_types)
|
names = classifier.by_types(names, types, exclude_types)
|
||||||
include, exclude = hook['files'], hook['exclude']
|
include, exclude = hook['files'], hook['exclude']
|
||||||
if not exclude_matches_any(filtered_by_types, include, exclude):
|
if not exclude_matches_any(names, include, exclude):
|
||||||
print(
|
print(
|
||||||
'The exclude pattern {!r} for {} does not match any files'
|
'The exclude pattern {!r} for {} does not match any files'
|
||||||
.format(exclude, hook['id']),
|
.format(exclude, hook['id']),
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import contextlib
|
import contextlib
|
||||||
import errno
|
import errno
|
||||||
import functools
|
|
||||||
import os.path
|
import os.path
|
||||||
import shutil
|
import shutil
|
||||||
import stat
|
import stat
|
||||||
|
|
@ -31,23 +30,6 @@ def mkdirp(path):
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
def memoize_by_cwd(func):
|
|
||||||
"""Memoize a function call based on os.getcwd()."""
|
|
||||||
@functools.wraps(func)
|
|
||||||
def wrapper(*args):
|
|
||||||
cwd = os.getcwd()
|
|
||||||
key = (cwd,) + args
|
|
||||||
try:
|
|
||||||
return wrapper._cache[key]
|
|
||||||
except KeyError:
|
|
||||||
ret = wrapper._cache[key] = func(*args)
|
|
||||||
return ret
|
|
||||||
|
|
||||||
wrapper._cache = {}
|
|
||||||
|
|
||||||
return wrapper
|
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
def clean_path_on_failure(path):
|
def clean_path_on_failure(path):
|
||||||
"""Cleans up the directory on an exceptional failure."""
|
"""Cleans up the directory on an exceptional failure."""
|
||||||
|
|
|
||||||
|
|
@ -11,9 +11,10 @@ import pytest
|
||||||
import pre_commit.constants as C
|
import pre_commit.constants as C
|
||||||
from pre_commit.commands.install_uninstall import install
|
from pre_commit.commands.install_uninstall import install
|
||||||
from pre_commit.commands.run import _compute_cols
|
from pre_commit.commands.run import _compute_cols
|
||||||
from pre_commit.commands.run import _filter_by_include_exclude
|
|
||||||
from pre_commit.commands.run import _get_skips
|
from pre_commit.commands.run import _get_skips
|
||||||
from pre_commit.commands.run import _has_unmerged_paths
|
from pre_commit.commands.run import _has_unmerged_paths
|
||||||
|
from pre_commit.commands.run import Classifier
|
||||||
|
from pre_commit.commands.run import filter_by_include_exclude
|
||||||
from pre_commit.commands.run import run
|
from pre_commit.commands.run import run
|
||||||
from pre_commit.util import cmd_output
|
from pre_commit.util import cmd_output
|
||||||
from pre_commit.util import make_executable
|
from pre_commit.util import make_executable
|
||||||
|
|
@ -748,18 +749,22 @@ def test_fail_fast(cap_out, store, repo_with_failing_hook):
|
||||||
assert printed.count(b'Failing hook') == 1
|
assert printed.count(b'Failing hook') == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_classifier_removes_dne():
|
||||||
|
classifier = Classifier(('this_file_does_not_exist',))
|
||||||
|
assert classifier.filenames == []
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def some_filenames():
|
def some_filenames():
|
||||||
return (
|
return (
|
||||||
'.pre-commit-hooks.yaml',
|
'.pre-commit-hooks.yaml',
|
||||||
'im_a_file_that_doesnt_exist.py',
|
|
||||||
'pre_commit/git.py',
|
'pre_commit/git.py',
|
||||||
'pre_commit/main.py',
|
'pre_commit/main.py',
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_include_exclude_base_case(some_filenames):
|
def test_include_exclude_base_case(some_filenames):
|
||||||
ret = _filter_by_include_exclude(some_filenames, '', '^$')
|
ret = filter_by_include_exclude(some_filenames, '', '^$')
|
||||||
assert ret == [
|
assert ret == [
|
||||||
'.pre-commit-hooks.yaml',
|
'.pre-commit-hooks.yaml',
|
||||||
'pre_commit/git.py',
|
'pre_commit/git.py',
|
||||||
|
|
@ -771,22 +776,22 @@ def test_include_exclude_base_case(some_filenames):
|
||||||
def test_matches_broken_symlink(tmpdir):
|
def test_matches_broken_symlink(tmpdir):
|
||||||
with tmpdir.as_cwd():
|
with tmpdir.as_cwd():
|
||||||
os.symlink('does-not-exist', 'link')
|
os.symlink('does-not-exist', 'link')
|
||||||
ret = _filter_by_include_exclude({'link'}, '', '^$')
|
ret = filter_by_include_exclude({'link'}, '', '^$')
|
||||||
assert ret == ['link']
|
assert ret == ['link']
|
||||||
|
|
||||||
|
|
||||||
def test_include_exclude_total_match(some_filenames):
|
def test_include_exclude_total_match(some_filenames):
|
||||||
ret = _filter_by_include_exclude(some_filenames, r'^.*\.py$', '^$')
|
ret = filter_by_include_exclude(some_filenames, r'^.*\.py$', '^$')
|
||||||
assert ret == ['pre_commit/git.py', 'pre_commit/main.py']
|
assert ret == ['pre_commit/git.py', 'pre_commit/main.py']
|
||||||
|
|
||||||
|
|
||||||
def test_include_exclude_does_search_instead_of_match(some_filenames):
|
def test_include_exclude_does_search_instead_of_match(some_filenames):
|
||||||
ret = _filter_by_include_exclude(some_filenames, r'\.yaml$', '^$')
|
ret = filter_by_include_exclude(some_filenames, r'\.yaml$', '^$')
|
||||||
assert ret == ['.pre-commit-hooks.yaml']
|
assert ret == ['.pre-commit-hooks.yaml']
|
||||||
|
|
||||||
|
|
||||||
def test_include_exclude_exclude_removes_files(some_filenames):
|
def test_include_exclude_exclude_removes_files(some_filenames):
|
||||||
ret = _filter_by_include_exclude(some_filenames, '', r'\.py$')
|
ret = filter_by_include_exclude(some_filenames, '', r'\.py$')
|
||||||
assert ret == ['.pre-commit-hooks.yaml']
|
assert ret == ['.pre-commit-hooks.yaml']
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,17 +1,14 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import os.path
|
import os.path
|
||||||
import random
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from pre_commit.util import CalledProcessError
|
from pre_commit.util import CalledProcessError
|
||||||
from pre_commit.util import clean_path_on_failure
|
from pre_commit.util import clean_path_on_failure
|
||||||
from pre_commit.util import cmd_output
|
from pre_commit.util import cmd_output
|
||||||
from pre_commit.util import memoize_by_cwd
|
|
||||||
from pre_commit.util import parse_version
|
from pre_commit.util import parse_version
|
||||||
from pre_commit.util import tmpdir
|
from pre_commit.util import tmpdir
|
||||||
from testing.util import cwd
|
|
||||||
|
|
||||||
|
|
||||||
def test_CalledProcessError_str():
|
def test_CalledProcessError_str():
|
||||||
|
|
@ -42,37 +39,6 @@ def test_CalledProcessError_str_nooutput():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def memoized_by_cwd():
|
|
||||||
@memoize_by_cwd
|
|
||||||
def func(arg):
|
|
||||||
return arg + str(random.getrandbits(64))
|
|
||||||
|
|
||||||
return func
|
|
||||||
|
|
||||||
|
|
||||||
def test_memoized_by_cwd_returns_same_twice_in_a_row(memoized_by_cwd):
|
|
||||||
ret = memoized_by_cwd('baz')
|
|
||||||
ret2 = memoized_by_cwd('baz')
|
|
||||||
assert ret is ret2
|
|
||||||
|
|
||||||
|
|
||||||
def test_memoized_by_cwd_returns_different_for_different_args(memoized_by_cwd):
|
|
||||||
ret = memoized_by_cwd('baz')
|
|
||||||
ret2 = memoized_by_cwd('bar')
|
|
||||||
assert ret.startswith('baz')
|
|
||||||
assert ret2.startswith('bar')
|
|
||||||
assert ret != ret2
|
|
||||||
|
|
||||||
|
|
||||||
def test_memoized_by_cwd_changes_with_different_cwd(memoized_by_cwd):
|
|
||||||
ret = memoized_by_cwd('baz')
|
|
||||||
with cwd('.git'):
|
|
||||||
ret2 = memoized_by_cwd('baz')
|
|
||||||
|
|
||||||
assert ret != ret2
|
|
||||||
|
|
||||||
|
|
||||||
def test_clean_on_failure_noop(in_tmpdir):
|
def test_clean_on_failure_noop(in_tmpdir):
|
||||||
with clean_path_on_failure('foo'):
|
with clean_path_on_failure('foo'):
|
||||||
pass
|
pass
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue