Speed up filename filtering.

Previously, a `getcwd` syscall was made for every filename that was filtered.

Instead, the per-file classification is now cached once per run.

- When all files are identified by filename only: ~45% improvement
- When no files are identified by filename only: ~55% improvement

This makes little difference to overall execution time; the bigger win is
eliminating the `memoize_by_cwd` hack.  Just removing the memoization would
have *increased* the runtime by 300-500%.
This commit is contained in:
Anthony Sottile 2019-01-23 20:42:27 -08:00
parent 38308dc02d
commit b1389603e0
6 changed files with 61 additions and 103 deletions

View file

@ -17,14 +17,47 @@ from pre_commit.repository import all_hooks
from pre_commit.repository import install_hook_envs
from pre_commit.staged_files_only import staged_files_only
from pre_commit.util import cmd_output
from pre_commit.util import memoize_by_cwd
from pre_commit.util import noop_context
logger = logging.getLogger('pre_commit')
tags_from_path = memoize_by_cwd(tags_from_path)
def filter_by_include_exclude(names, include, exclude):
    """Return the names matching ``include`` and not matching ``exclude``.

    Both patterns are regular expressions applied with ``search``, so they
    are unanchored unless the pattern anchors itself.  The order of
    ``names`` is preserved in the returned list.
    """
    # Compile both patterns once, up front, rather than once per name.
    matches_include = re.compile(include).search
    matches_exclude = re.compile(exclude).search

    kept = []
    for name in names:
        if matches_include(name) and not matches_exclude(name):
            kept.append(name)
    return kept
class Classifier(object):
    """Work out which filenames a hook applies to.

    The tags computed for each filename are remembered on the instance, so
    a file is only classified once no matter how many hooks look at it.
    """

    def __init__(self, filenames):
        # Only names that exist on disk are kept; ``lexists`` also accepts
        # broken symlinks.
        self.filenames = [
            name for name in filenames if os.path.lexists(name)
        ]
        # filename -> tags, filled lazily by ``_types_for_file``
        self._types_cache = {}

    def _types_for_file(self, filename):
        """Return the tags for ``filename``, computing them at most once."""
        cache = self._types_cache
        if filename not in cache:
            cache[filename] = tags_from_path(filename)
        return cache[filename]

    def by_types(self, names, types, exclude_types):
        """Return the names having every tag in ``types`` and no tag in
        ``exclude_types``, as a list in the original order.
        """
        required = frozenset(types)
        forbidden = frozenset(exclude_types)

        selected = []
        for name in names:
            tags = self._types_for_file(name)
            # must be a superset of the required tags and share nothing
            # with the forbidden ones
            if tags >= required and not tags & forbidden:
                selected.append(name)
        return selected

    def filenames_for_hook(self, hook):
        """Return the filenames ``hook`` should run against.

        The hook's ``files`` / ``exclude`` patterns are applied first, then
        its ``types`` / ``exclude_types``.
        """
        matched = filter_by_include_exclude(
            self.filenames, hook.files, hook.exclude,
        )
        return self.by_types(matched, hook.types, hook.exclude_types)
def _get_skips(environ):
@ -36,37 +69,12 @@ def _hook_msg_start(hook, verbose):
return '{}{}'.format('[{}] '.format(hook.id) if verbose else '', hook.name)
def _filter_by_include_exclude(filenames, include, exclude):
    """Return the filenames that match ``include``, do not match
    ``exclude``, and exist on disk.

    Both patterns are regular expressions applied with ``search``.  The
    result is a list in the same order as ``filenames``.
    """
    include_re = re.compile(include)
    exclude_re = re.compile(exclude)

    selected = []
    for filename in filenames:
        if not include_re.search(filename):
            continue
        if exclude_re.search(filename):
            continue
        # Checked last: this is the only test that touches the filesystem.
        if os.path.lexists(filename):
            selected.append(filename)
    return selected
def _filter_by_types(filenames, types, exclude_types):
    """Return a tuple of the filenames whose tags include every entry of
    ``types`` and none of ``exclude_types``.

    Tags come from ``tags_from_path``; input order is preserved.
    """
    required = frozenset(types)
    forbidden = frozenset(exclude_types)

    def _applies(tags):
        # superset of the required tags, disjoint from the forbidden ones
        return tags >= required and not tags & forbidden

    return tuple(
        filename for filename in filenames
        if _applies(tags_from_path(filename))
    )
SKIPPED = 'Skipped'
NO_FILES = '(no files to check)'
def _run_single_hook(filenames, hook, args, skips, cols):
include, exclude = hook.files, hook.exclude
filenames = _filter_by_include_exclude(filenames, include, exclude)
types, exclude_types = hook.types, hook.exclude_types
filenames = _filter_by_types(filenames, types, exclude_types)
def _run_single_hook(classifier, hook, args, skips, cols):
filenames = classifier.filenames_for_hook(hook)
if hook.language == 'pcre':
logger.warning(
@ -193,10 +201,11 @@ def _run_hooks(config, hooks, args, environ):
skips = _get_skips(environ)
cols = _compute_cols(hooks, args.verbose)
filenames = _all_filenames(args)
filenames = _filter_by_include_exclude(filenames, '', config['exclude'])
filenames = filter_by_include_exclude(filenames, '', config['exclude'])
classifier = Classifier(filenames)
retval = 0
for hook in hooks:
retval |= _run_single_hook(filenames, hook, args, skips, cols)
retval |= _run_single_hook(classifier, hook, args, skips, cols)
if retval and config['fail_fast']:
break
if retval and args.show_diff_on_failure and git.has_diff():

View file

@ -3,24 +3,19 @@ import argparse
import pre_commit.constants as C
from pre_commit import git
from pre_commit.clientlib import load_config
from pre_commit.commands.run import _filter_by_include_exclude
from pre_commit.commands.run import _filter_by_types
from pre_commit.commands.run import Classifier
from pre_commit.repository import all_hooks
from pre_commit.store import Store
def check_all_hooks_match_files(config_file):
files = git.get_all_files()
classifier = Classifier(git.get_all_files())
retv = 0
for hook in all_hooks(load_config(config_file), Store()):
if hook.always_run or hook.language == 'fail':
continue
include, exclude = hook.files, hook.exclude
filtered = _filter_by_include_exclude(files, include, exclude)
types, exclude_types = hook.types, hook.exclude_types
filtered = _filter_by_types(filtered, types, exclude_types)
if not filtered:
elif not classifier.filenames_for_hook(hook):
print('{} does not apply to this repository'.format(hook.id))
retv = 1

View file

@ -9,7 +9,7 @@ import pre_commit.constants as C
from pre_commit import git
from pre_commit.clientlib import load_config
from pre_commit.clientlib import MANIFEST_HOOK_DICT
from pre_commit.commands.run import _filter_by_types
from pre_commit.commands.run import Classifier
def exclude_matches_any(filenames, include, exclude):
@ -24,11 +24,11 @@ def exclude_matches_any(filenames, include, exclude):
def check_useless_excludes(config_file):
config = load_config(config_file)
files = git.get_all_files()
classifier = Classifier(git.get_all_files())
retv = 0
exclude = config['exclude']
if not exclude_matches_any(files, '', exclude):
if not exclude_matches_any(classifier.filenames, '', exclude):
print(
'The global exclude pattern {!r} does not match any files'
.format(exclude),
@ -40,10 +40,11 @@ def check_useless_excludes(config_file):
# Not actually a manifest dict, but this more accurately reflects
# the defaults applied during runtime
hook = apply_defaults(hook, MANIFEST_HOOK_DICT)
names = classifier.filenames
types, exclude_types = hook['types'], hook['exclude_types']
filtered_by_types = _filter_by_types(files, types, exclude_types)
names = classifier.by_types(names, types, exclude_types)
include, exclude = hook['files'], hook['exclude']
if not exclude_matches_any(filtered_by_types, include, exclude):
if not exclude_matches_any(names, include, exclude):
print(
'The exclude pattern {!r} for {} does not match any files'
.format(exclude, hook['id']),

View file

@ -2,7 +2,6 @@ from __future__ import unicode_literals
import contextlib
import errno
import functools
import os.path
import shutil
import stat
@ -31,23 +30,6 @@ def mkdirp(path):
raise
def memoize_by_cwd(func):
    """Memoize ``func`` on its positional arguments plus ``os.getcwd()``.

    The returned wrapper exposes its cache dict as ``wrapper._cache``.
    Note that ``os.getcwd()`` is called on every invocation, including
    cache hits.
    """
    @functools.wraps(func)
    def wrapper(*args):
        cache_key = (os.getcwd(),) + args
        # Look the dict up through the attribute each time so that callers
        # who replace ``wrapper._cache`` see the swap take effect.
        if cache_key in wrapper._cache:
            return wrapper._cache[cache_key]
        result = func(*args)
        wrapper._cache[cache_key] = result
        return result

    wrapper._cache = {}
    return wrapper
@contextlib.contextmanager
def clean_path_on_failure(path):
"""Cleans up the directory on an exceptional failure."""