Speed up filename filtering.

Before there was a `getcwd` syscall for every filename which was filtered. Instead this is now cached per-run. - When all files are identified by filename only: ~45% improvement - When no files are identified by filename only: ~55% improvement This makes little difference to overall execution, the bigger win is eliminating the `memoize_by_cwd` hack. Just removing the memoization would have *increased* the runtime by 300-500%.
2026-04-14 17:41:45 +04:00 · 2019-01-23 20:42:27 -08:00 · 2019-01-23 20:42:27 -08:00 · b1389603e0
commit b1389603e0
parent 38308dc02d
6 changed files with 61 additions and 103 deletions
--- a/pre_commit/commands/run.py
+++ b/pre_commit/commands/run.py
@ -17,14 +17,47 @@ from pre_commit.repository import all_hooks
 from pre_commit.repository import install_hook_envs
 from pre_commit.staged_files_only import staged_files_only
 from pre_commit.util import cmd_output
-from pre_commit.util import memoize_by_cwd
 from pre_commit.util import noop_context


 logger = logging.getLogger('pre_commit')


-tags_from_path = memoize_by_cwd(tags_from_path)
+def filter_by_include_exclude(names, include, exclude):
+    include_re, exclude_re = re.compile(include), re.compile(exclude)
+    return [
+        filename for filename in names
+        if include_re.search(filename)
+        if not exclude_re.search(filename)
+    ]
+
+
+class Classifier(object):
+    def __init__(self, filenames):
+        self.filenames = [f for f in filenames if os.path.lexists(f)]
+        self._types_cache = {}
+
+    def _types_for_file(self, filename):
+        try:
+            return self._types_cache[filename]
+        except KeyError:
+            ret = self._types_cache[filename] = tags_from_path(filename)
+            return ret
+
+    def by_types(self, names, types, exclude_types):
+        types, exclude_types = frozenset(types), frozenset(exclude_types)
+        ret = []
+        for filename in names:
+            tags = self._types_for_file(filename)
+            if tags >= types and not tags & exclude_types:
+                ret.append(filename)
+        return ret
+
+    def filenames_for_hook(self, hook):
+        names = self.filenames
+        names = filter_by_include_exclude(names, hook.files, hook.exclude)
+        names = self.by_types(names, hook.types, hook.exclude_types)
+        return names


 def _get_skips(environ):
@ -36,37 +69,12 @@ def _hook_msg_start(hook, verbose):
    return '{}{}'.format('[{}] '.format(hook.id) if verbose else '', hook.name)


-def _filter_by_include_exclude(filenames, include, exclude):
-    include_re, exclude_re = re.compile(include), re.compile(exclude)
-    return [
-        filename for filename in filenames
-        if (
-            include_re.search(filename) and
-            not exclude_re.search(filename) and
-            os.path.lexists(filename)
-        )
-    ]
-
-
-def _filter_by_types(filenames, types, exclude_types):
-    types, exclude_types = frozenset(types), frozenset(exclude_types)
-    ret = []
-    for filename in filenames:
-        tags = tags_from_path(filename)
-        if tags >= types and not tags & exclude_types:
-            ret.append(filename)
-    return tuple(ret)
-
-
 SKIPPED = 'Skipped'
 NO_FILES = '(no files to check)'


-def _run_single_hook(filenames, hook, args, skips, cols):
-    include, exclude = hook.files, hook.exclude
-    filenames = _filter_by_include_exclude(filenames, include, exclude)
-    types, exclude_types = hook.types, hook.exclude_types
-    filenames = _filter_by_types(filenames, types, exclude_types)
+def _run_single_hook(classifier, hook, args, skips, cols):
+    filenames = classifier.filenames_for_hook(hook)

    if hook.language == 'pcre':
        logger.warning(
@ -193,10 +201,11 @@ def _run_hooks(config, hooks, args, environ):
    skips = _get_skips(environ)
    cols = _compute_cols(hooks, args.verbose)
    filenames = _all_filenames(args)
-    filenames = _filter_by_include_exclude(filenames, '', config['exclude'])
+    filenames = filter_by_include_exclude(filenames, '', config['exclude'])
+    classifier = Classifier(filenames)
    retval = 0
    for hook in hooks:
-        retval |= _run_single_hook(filenames, hook, args, skips, cols)
+        retval |= _run_single_hook(classifier, hook, args, skips, cols)
        if retval and config['fail_fast']:
            break
    if retval and args.show_diff_on_failure and git.has_diff():