Speed up filename filtering.

Previously there was a `getcwd` syscall for every filename that was filtered. Instead, the per-file type tags are now cached once per run.

- When all files are identified by filename only: ~45% improvement
- When no files are identified by filename only: ~55% improvement

This makes little difference to overall execution; the bigger win is eliminating the `memoize_by_cwd` hack. Just removing the memoization would have *increased* the runtime by 300-500%.
2026-04-15 01:51:46 +04:00 · 2019-01-23 20:42:27 -08:00 · 2019-01-23 20:42:27 -08:00 · b1389603e0
commit b1389603e0
parent 38308dc02d
6 changed files with 61 additions and 103 deletions
--- a/pre_commit/commands/run.py
+++ b/pre_commit/commands/run.py
@ -17,14 +17,47 @@ from pre_commit.repository import all_hooks
 from pre_commit.repository import install_hook_envs
 from pre_commit.staged_files_only import staged_files_only
 from pre_commit.util import cmd_output
 from pre_commit.util import memoize_by_cwd
 from pre_commit.util import noop_context
 logger = logging.getLogger('pre_commit')
-tags_from_path = memoize_by_cwd(tags_from_path)
+def filter_by_include_exclude(names, include, exclude):
    """Return the entries of ``names`` selected by two regex patterns.

    ``include`` and ``exclude`` are regular expression strings.  A name is
    kept when ``include`` matches somewhere in it and ``exclude`` does not.
    The result is a list in the iteration order of ``names``.
    """
    include_re, exclude_re = re.compile(include), re.compile(exclude)
    return [
        filename for filename in names
        # ``search`` (not ``match``): the pattern may hit anywhere in the name.
        if include_re.search(filename)
        if not exclude_re.search(filename)
    ]
 class Classifier(object):
    """Hold the candidate filenames for a run and cache their type tags.

    Only paths for which ``os.path.lexists`` is true are retained in
    ``filenames``.  Tags returned by ``tags_from_path`` are stored per
    filename so each file is looked up at most once per instance.
    """

    def __init__(self, filenames):
        # filename -> tags, filled lazily by ``_types_for_file``.
        self._types_cache = {}
        self.filenames = list(filter(os.path.lexists, filenames))

    def _types_for_file(self, filename):
        """Return the tags for ``filename``, computing them on first use."""
        cache = self._types_cache
        if filename not in cache:
            cache[filename] = tags_from_path(filename)
        return cache[filename]

    def by_types(self, names, types, exclude_types):
        """Return the ``names`` having every tag in ``types`` and no tag
        in ``exclude_types``, as a list in the original order.
        """
        required = frozenset(types)
        forbidden = frozenset(exclude_types)
        tagged = ((name, self._types_for_file(name)) for name in names)
        return [
            name for name, tags in tagged
            if tags >= required and not tags & forbidden
        ]

    def filenames_for_hook(self, hook):
        """Return the retained filenames that ``hook`` applies to.

        The hook's ``files`` / ``exclude`` patterns are applied first, then
        its ``types`` / ``exclude_types``.
        """
        matching = filter_by_include_exclude(
            self.filenames, hook.files, hook.exclude,
        )
        return self.by_types(matching, hook.types, hook.exclude_types)
 def _get_skips(environ):
@ -36,37 +69,12 @@ def _hook_msg_start(hook, verbose):
    return '{}{}'.format('[{}] '.format(hook.id) if verbose else '', hook.name)
 def _filter_by_include_exclude(filenames, include, exclude):
    include_re, exclude_re = re.compile(include), re.compile(exclude)
    return [
        filename for filename in filenames
        if (
            include_re.search(filename) and
            not exclude_re.search(filename) and
            os.path.lexists(filename)
        )
    ]
 def _filter_by_types(filenames, types, exclude_types):
    types, exclude_types = frozenset(types), frozenset(exclude_types)
    ret = []
    for filename in filenames:
        tags = tags_from_path(filename)
        if tags >= types and not tags & exclude_types:
            ret.append(filename)
    return tuple(ret)
 SKIPPED = 'Skipped'
 NO_FILES = '(no files to check)'
-def _run_single_hook(filenames, hook, args, skips, cols):
+def _run_single_hook(classifier, hook, args, skips, cols):
-    include, exclude = hook.files, hook.exclude
+    filenames = classifier.filenames_for_hook(hook)
    filenames = _filter_by_include_exclude(filenames, include, exclude)
    types, exclude_types = hook.types, hook.exclude_types
    filenames = _filter_by_types(filenames, types, exclude_types)
    if hook.language == 'pcre':
        logger.warning(
@ -193,10 +201,11 @@ def _run_hooks(config, hooks, args, environ):
    skips = _get_skips(environ)
    cols = _compute_cols(hooks, args.verbose)
    filenames = _all_filenames(args)
-    filenames = _filter_by_include_exclude(filenames, '', config['exclude'])
+    filenames = filter_by_include_exclude(filenames, '', config['exclude'])
    classifier = Classifier(filenames)
    retval = 0
    for hook in hooks:
-        retval |= _run_single_hook(filenames, hook, args, skips, cols)
+        retval |= _run_single_hook(classifier, hook, args, skips, cols)
        if retval and config['fail_fast']:
            break
    if retval and args.show_diff_on_failure and git.has_diff():
--- a/pre_commit/meta_hooks/check_hooks_apply.py
+++ b/pre_commit/meta_hooks/check_hooks_apply.py
@ -3,24 +3,19 @@ import argparse
 import pre_commit.constants as C
 from pre_commit import git
 from pre_commit.clientlib import load_config
-from pre_commit.commands.run import _filter_by_include_exclude
+from pre_commit.commands.run import Classifier
 from pre_commit.commands.run import _filter_by_types
 from pre_commit.repository import all_hooks
 from pre_commit.store import Store
 def check_all_hooks_match_files(config_file):
-    files = git.get_all_files()
+    classifier = Classifier(git.get_all_files())
    retv = 0
    for hook in all_hooks(load_config(config_file), Store()):
        if hook.always_run or hook.language == 'fail':
            continue
-        include, exclude = hook.files, hook.exclude
+        elif not classifier.filenames_for_hook(hook):
        filtered = _filter_by_include_exclude(files, include, exclude)
        types, exclude_types = hook.types, hook.exclude_types
        filtered = _filter_by_types(filtered, types, exclude_types)
        if not filtered:
            print('{} does not apply to this repository'.format(hook.id))
            retv = 1
--- a/pre_commit/meta_hooks/check_useless_excludes.py
+++ b/pre_commit/meta_hooks/check_useless_excludes.py
@ -9,7 +9,7 @@ import pre_commit.constants as C
 from pre_commit import git
 from pre_commit.clientlib import load_config
 from pre_commit.clientlib import MANIFEST_HOOK_DICT
-from pre_commit.commands.run import _filter_by_types
+from pre_commit.commands.run import Classifier
 def exclude_matches_any(filenames, include, exclude):
@ -24,11 +24,11 @@ def exclude_matches_any(filenames, include, exclude):
 def check_useless_excludes(config_file):
    config = load_config(config_file)
-    files = git.get_all_files()
+    classifier = Classifier(git.get_all_files())
    retv = 0
    exclude = config['exclude']
-    if not exclude_matches_any(files, '', exclude):
+    if not exclude_matches_any(classifier.filenames, '', exclude):
        print(
            'The global exclude pattern {!r} does not match any files'
            .format(exclude),
@ -40,10 +40,11 @@ def check_useless_excludes(config_file):
            # Not actually a manifest dict, but this more accurately reflects
            # the defaults applied during runtime
            hook = apply_defaults(hook, MANIFEST_HOOK_DICT)
            names = classifier.filenames
            types, exclude_types = hook['types'], hook['exclude_types']
-            filtered_by_types = _filter_by_types(files, types, exclude_types)
+            names = classifier.by_types(names, types, exclude_types)
            include, exclude = hook['files'], hook['exclude']
-            if not exclude_matches_any(filtered_by_types, include, exclude):
+            if not exclude_matches_any(names, include, exclude):
                print(
                    'The exclude pattern {!r} for {} does not match any files'
                    .format(exclude, hook['id']),
--- a/pre_commit/util.py
+++ b/pre_commit/util.py
@ -2,7 +2,6 @@ from __future__ import unicode_literals
 import contextlib
 import errno
 import functools
 import os.path
 import shutil
 import stat
@ -31,23 +30,6 @@ def mkdirp(path):
            raise
 def memoize_by_cwd(func):
    """Cache results of ``func`` keyed on os.getcwd() plus its arguments.

    Only positional arguments are accepted.  The cache dictionary is
    exposed as ``_cache`` on the returned wrapper.
    """
    @functools.wraps(func)
    def wrapper(*args):
        # The working directory is part of the key, so the same arguments
        # are recomputed after a chdir.
        cache_key = (os.getcwd(),) + args
        if cache_key not in wrapper._cache:
            wrapper._cache[cache_key] = func(*args)
        return wrapper._cache[cache_key]

    wrapper._cache = {}
    return wrapper
@contextlib.contextmanager
 def clean_path_on_failure(path):
    """Cleans up the directory on an exceptional failure."""
--- a/tests/commands/run_test.py
+++ b/tests/commands/run_test.py
@ -11,9 +11,10 @@ import pytest
 import pre_commit.constants as C
 from pre_commit.commands.install_uninstall import install
 from pre_commit.commands.run import _compute_cols
 from pre_commit.commands.run import _filter_by_include_exclude
 from pre_commit.commands.run import _get_skips
 from pre_commit.commands.run import _has_unmerged_paths
 from pre_commit.commands.run import Classifier
 from pre_commit.commands.run import filter_by_include_exclude
 from pre_commit.commands.run import run
 from pre_commit.util import cmd_output
 from pre_commit.util import make_executable
@ -748,18 +749,22 @@ def test_fail_fast(cap_out, store, repo_with_failing_hook):
    assert printed.count(b'Failing hook') == 1
 def test_classifier_removes_dne():
    """A path that does not exist is dropped when a Classifier is built."""
    missing_only = ('this_file_does_not_exist',)
    assert Classifier(missing_only).filenames == []
@pytest.fixture
 def some_filenames():
    """Fixture: a fixed tuple of sample paths for the filter tests."""
    return (
        '.pre-commit-hooks.yaml',
        # NOTE(review): presumably absent from the repository, as the name
        # suggests -- confirm against the tests that consume this fixture.
        'im_a_file_that_doesnt_exist.py',
        'pre_commit/git.py',
        'pre_commit/main.py',
    )
 def test_include_exclude_base_case(some_filenames):
-    ret = _filter_by_include_exclude(some_filenames, '', '^$')
+    ret = filter_by_include_exclude(some_filenames, '', '^$')
    assert ret == [
        '.pre-commit-hooks.yaml',
        'pre_commit/git.py',
@ -771,22 +776,22 @@ def test_include_exclude_base_case(some_filenames):
 def test_matches_broken_symlink(tmpdir):
    with tmpdir.as_cwd():
        os.symlink('does-not-exist', 'link')
-        ret = _filter_by_include_exclude({'link'}, '', '^$')
+        ret = filter_by_include_exclude({'link'}, '', '^$')
        assert ret == ['link']
 def test_include_exclude_total_match(some_filenames):
-    ret = _filter_by_include_exclude(some_filenames, r'^.*\.py$', '^$')
+    ret = filter_by_include_exclude(some_filenames, r'^.*\.py$', '^$')
    assert ret == ['pre_commit/git.py', 'pre_commit/main.py']
 def test_include_exclude_does_search_instead_of_match(some_filenames):
-    ret = _filter_by_include_exclude(some_filenames, r'\.yaml$', '^$')
+    ret = filter_by_include_exclude(some_filenames, r'\.yaml$', '^$')
    assert ret == ['.pre-commit-hooks.yaml']
 def test_include_exclude_exclude_removes_files(some_filenames):
-    ret = _filter_by_include_exclude(some_filenames, '', r'\.py$')
+    ret = filter_by_include_exclude(some_filenames, '', r'\.py$')
    assert ret == ['.pre-commit-hooks.yaml']
--- a/tests/util_test.py
+++ b/tests/util_test.py
@ -1,17 +1,14 @@
 from __future__ import unicode_literals
 import os.path
 import random
 import pytest
 from pre_commit.util import CalledProcessError
 from pre_commit.util import clean_path_on_failure
 from pre_commit.util import cmd_output
 from pre_commit.util import memoize_by_cwd
 from pre_commit.util import parse_version
 from pre_commit.util import tmpdir
 from testing.util import cwd
 def test_CalledProcessError_str():
@ -42,37 +39,6 @@ def test_CalledProcessError_str_nooutput():
    )
@pytest.fixture
 def memoized_by_cwd():
    """Fixture: a function wrapped with ``memoize_by_cwd``."""
    @memoize_by_cwd
    def func(arg):
        # The random suffix makes every real (uncached) call produce a
        # different string, so a repeated value indicates a cached result.
        return arg + str(random.getrandbits(64))
    return func
 def test_memoized_by_cwd_returns_same_twice_in_a_row(memoized_by_cwd):
    """Two identical calls in a row yield the very same object."""
    first = memoized_by_cwd('baz')
    second = memoized_by_cwd('baz')
    assert first is second
 def test_memoized_by_cwd_returns_different_for_different_args(memoized_by_cwd):
    """Different arguments produce different results."""
    baz_result = memoized_by_cwd('baz')
    bar_result = memoized_by_cwd('bar')
    assert baz_result.startswith('baz')
    assert bar_result.startswith('bar')
    assert baz_result != bar_result
 def test_memoized_by_cwd_changes_with_different_cwd(memoized_by_cwd):
    """The same argument yields a different result under another cwd."""
    from_start_dir = memoized_by_cwd('baz')
    with cwd('.git'):
        from_git_dir = memoized_by_cwd('baz')
    assert from_start_dir != from_git_dir
 def test_clean_on_failure_noop(in_tmpdir):
    with clean_path_on_failure('foo'):
        pass