mirror of
https://github.com/pre-commit/pre-commit.git
synced 2026-04-14 17:41:45 +04:00
add pass_filenames_via_stdin for large changesets
pre-commit currently passes selected filenames to hooks via argv.
For large changesets (or --all-files), argv length limits are hit and
filenames are partitioned, causing multiple hook invocations.
This means there is currently no built-in way to pass filenames to an
underlying hook in one shot without chunking / re-running. The only practical
workaround is to set pass_filenames: false and run custom git operations in
hook code to reconstruct the file set, which is expensive and duplicates
pre-commit's own file-selection logic.
This change adds a hook option:
pass_filenames_via_stdin: true
When enabled, pre-commit sends filenames as NUL-delimited bytes on stdin and
runs the hook in a single invocation (no argv chunking).
Why NUL-delimited stdin:
- safe for filenames containing spaces/newlines
- matches established -0 conventions in unix tooling
Usage for hook authors:
- shell:
while IFS= read -r -d '' filename; do
...
done
- python:
data = sys.stdin.buffer.read()
filenames = [os.fsdecode(p) for p in data.split(b'\0') if p]
Behavior notes:
- default remains argv-based passing
- pass_filenames: false still disables filename passing entirely
Implementation includes schema/runtime wiring, shared NUL encode/decode
helpers, and tests covering defaulting and runtime behavior.
This commit is contained in:
parent
8416413a0e
commit
635912514d
18 changed files with 147 additions and 2 deletions
|
|
@ -256,6 +256,7 @@ MANIFEST_HOOK_DICT = cfgv.Map(
|
|||
cfgv.Optional('always_run', cfgv.check_bool, False),
|
||||
cfgv.Optional('fail_fast', cfgv.check_bool, False),
|
||||
cfgv.Optional('pass_filenames', cfgv.check_bool, True),
|
||||
cfgv.Optional('pass_filenames_via_stdin', cfgv.check_bool, False),
|
||||
cfgv.Optional('description', cfgv.check_string, ''),
|
||||
cfgv.Optional('language_version', cfgv.check_string, C.DEFAULT),
|
||||
cfgv.Optional('log_file', cfgv.check_string, ''),
|
||||
|
|
|
|||
|
|
@ -185,8 +185,10 @@ def _run_single_hook(
|
|||
# print hook and dots first in case the hook takes a while to run
|
||||
output.write(_start_msg(start=hook.name, end_len=6, cols=cols))
|
||||
|
||||
pass_filenames_via_stdin = hook.pass_filenames_via_stdin
|
||||
if not hook.pass_filenames:
|
||||
filenames = ()
|
||||
pass_filenames_via_stdin = False
|
||||
time_before = time.monotonic()
|
||||
language = languages[hook.language]
|
||||
with language.in_env(hook.prefix, hook.language_version):
|
||||
|
|
@ -198,6 +200,7 @@ def _run_single_hook(
|
|||
is_local=hook.src == 'local',
|
||||
require_serial=hook.require_serial,
|
||||
color=use_color,
|
||||
pass_filenames_via_stdin=pass_filenames_via_stdin,
|
||||
)
|
||||
duration = round(time.monotonic() - time_before, 2) or 0
|
||||
diff_after = _get_diff()
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ class Hook(NamedTuple):
|
|||
always_run: bool
|
||||
fail_fast: bool
|
||||
pass_filenames: bool
|
||||
pass_filenames_via_stdin: bool
|
||||
description: str
|
||||
language_version: str
|
||||
log_file: str
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ from pre_commit.util import cmd_output_b
|
|||
FIXED_RANDOM_SEED = 1542676187
|
||||
|
||||
SHIMS_RE = re.compile(r'[/\\]shims[/\\]')
|
||||
NUL = b'\0'
|
||||
|
||||
|
||||
class Language(Protocol):
|
||||
|
|
@ -56,6 +57,7 @@ class Language(Protocol):
|
|||
is_local: bool,
|
||||
require_serial: bool,
|
||||
color: bool,
|
||||
pass_filenames_via_stdin: bool = False,
|
||||
) -> tuple[int, bytes]:
|
||||
...
|
||||
|
||||
|
|
@ -153,13 +155,33 @@ def _shuffled(seq: Sequence[str]) -> list[str]:
|
|||
return seq
|
||||
|
||||
|
||||
def to_nul_delimited_filenames(file_args: Sequence[str]) -> bytes:
|
||||
ret = NUL.join(os.fsencode(filename) for filename in file_args)
|
||||
return ret + NUL if ret else ret
|
||||
|
||||
|
||||
def from_nul_delimited_filenames(filenames: bytes) -> list[str]:
|
||||
return [os.fsdecode(part) for part in filenames.split(NUL) if part]
|
||||
|
||||
|
||||
def run_xargs(
|
||||
cmd: tuple[str, ...],
|
||||
file_args: Sequence[str],
|
||||
*,
|
||||
require_serial: bool,
|
||||
color: bool,
|
||||
pass_filenames_via_stdin: bool = False,
|
||||
) -> tuple[int, bytes]:
|
||||
if pass_filenames_via_stdin:
|
||||
stdin = to_nul_delimited_filenames(file_args)
|
||||
return xargs.xargs(
|
||||
cmd,
|
||||
(),
|
||||
target_concurrency=1,
|
||||
color=color,
|
||||
input=stdin,
|
||||
)
|
||||
|
||||
if require_serial:
|
||||
jobs = 1
|
||||
else:
|
||||
|
|
@ -187,10 +209,12 @@ def basic_run_hook(
|
|||
is_local: bool,
|
||||
require_serial: bool,
|
||||
color: bool,
|
||||
pass_filenames_via_stdin: bool = False,
|
||||
) -> tuple[int, bytes]:
|
||||
return run_xargs(
|
||||
hook_cmd(entry, args),
|
||||
file_args,
|
||||
require_serial=require_serial,
|
||||
color=color,
|
||||
pass_filenames_via_stdin=pass_filenames_via_stdin,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -165,6 +165,7 @@ def run_hook(
|
|||
is_local: bool,
|
||||
require_serial: bool,
|
||||
color: bool,
|
||||
pass_filenames_via_stdin: bool = False,
|
||||
) -> tuple[int, bytes]: # pragma: win32 no cover
|
||||
# Rebuild the docker image in case it has gone missing, as many people do
|
||||
# automated cleanup of docker images.
|
||||
|
|
@ -178,4 +179,5 @@ def run_hook(
|
|||
file_args,
|
||||
require_serial=require_serial,
|
||||
color=color,
|
||||
pass_filenames_via_stdin=pass_filenames_via_stdin,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ def run_hook(
|
|||
is_local: bool,
|
||||
require_serial: bool,
|
||||
color: bool,
|
||||
pass_filenames_via_stdin: bool = False,
|
||||
) -> tuple[int, bytes]: # pragma: win32 no cover
|
||||
cmd = docker_cmd(color=color) + lang_base.hook_cmd(entry, args)
|
||||
return lang_base.run_xargs(
|
||||
|
|
@ -29,4 +30,5 @@ def run_hook(
|
|||
file_args,
|
||||
require_serial=require_serial,
|
||||
color=color,
|
||||
pass_filenames_via_stdin=pass_filenames_via_stdin,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ def run_hook(
|
|||
is_local: bool,
|
||||
require_serial: bool,
|
||||
color: bool,
|
||||
pass_filenames_via_stdin: bool = False,
|
||||
) -> tuple[int, bytes]:
|
||||
out = f'{entry}\n\n'.encode()
|
||||
out += b'\n'.join(f.encode() for f in file_args) + b'\n'
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ def run_hook(
|
|||
is_local: bool,
|
||||
require_serial: bool,
|
||||
color: bool,
|
||||
pass_filenames_via_stdin: bool = False,
|
||||
) -> tuple[int, bytes]:
|
||||
# `entry` is a (hook-repo relative) file followed by (optional) args, e.g.
|
||||
# `bin/id.jl` or `bin/hook.jl --arg1 --arg2` so we
|
||||
|
|
@ -43,6 +44,7 @@ def run_hook(
|
|||
file_args,
|
||||
require_serial=require_serial,
|
||||
color=color,
|
||||
pass_filenames_via_stdin=pass_filenames_via_stdin,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -96,8 +96,12 @@ def run_hook(
|
|||
is_local: bool,
|
||||
require_serial: bool,
|
||||
color: bool,
|
||||
pass_filenames_via_stdin: bool = False,
|
||||
) -> tuple[int, bytes]:
|
||||
cmd = (sys.executable, '-m', __name__, *args, entry)
|
||||
if pass_filenames_via_stdin:
|
||||
stdin = lang_base.to_nul_delimited_filenames(file_args)
|
||||
return xargs(cmd, (), color=color, input=stdin)
|
||||
return xargs(cmd, file_args, color=color)
|
||||
|
||||
|
||||
|
|
@ -116,6 +120,11 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||
parser.add_argument('filenames', nargs='*')
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
if not args.filenames:
|
||||
stdin = sys.stdin.buffer.read()
|
||||
if stdin:
|
||||
args.filenames = lang_base.from_nul_delimited_filenames(stdin)
|
||||
|
||||
flags = re.IGNORECASE if args.ignore_case else 0
|
||||
if args.multiline:
|
||||
flags |= re.MULTILINE | re.DOTALL
|
||||
|
|
|
|||
|
|
@ -268,6 +268,7 @@ def run_hook(
|
|||
is_local: bool,
|
||||
require_serial: bool,
|
||||
color: bool,
|
||||
pass_filenames_via_stdin: bool = False,
|
||||
) -> tuple[int, bytes]:
|
||||
cmd = _cmd_from_hook(prefix, entry, args, is_local=is_local)
|
||||
return lang_base.run_xargs(
|
||||
|
|
@ -275,4 +276,5 @@ def run_hook(
|
|||
file_args,
|
||||
require_serial=require_serial,
|
||||
color=color,
|
||||
pass_filenames_via_stdin=pass_filenames_via_stdin,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ def run_hook(
|
|||
is_local: bool,
|
||||
require_serial: bool,
|
||||
color: bool,
|
||||
pass_filenames_via_stdin: bool = False,
|
||||
) -> tuple[int, bytes]:
|
||||
cmd = lang_base.hook_cmd(entry, args)
|
||||
cmd = (prefix.path(cmd[0]), *cmd[1:])
|
||||
|
|
@ -29,4 +30,5 @@ def run_hook(
|
|||
file_args,
|
||||
require_serial=require_serial,
|
||||
color=color,
|
||||
pass_filenames_via_stdin=pass_filenames_via_stdin,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -92,6 +92,7 @@ def cmd_output_b(
|
|||
check: bool = True,
|
||||
**kwargs: Any,
|
||||
) -> tuple[int, bytes, bytes | None]:
|
||||
input_data = kwargs.pop('input', None)
|
||||
_setdefault_kwargs(kwargs)
|
||||
|
||||
try:
|
||||
|
|
@ -104,7 +105,7 @@ def cmd_output_b(
|
|||
except OSError as e:
|
||||
returncode, stdout_b, stderr_b = _oserror_to_output(e)
|
||||
else:
|
||||
stdout_b, stderr_b = proc.communicate()
|
||||
stdout_b, stderr_b = proc.communicate(input_data)
|
||||
returncode = proc.returncode
|
||||
|
||||
if check and returncode:
|
||||
|
|
|
|||
|
|
@ -142,7 +142,8 @@ def xargs(
|
|||
color: Make a pty if on a platform that supports it
|
||||
target_concurrency: Target number of partitions to run concurrently
|
||||
"""
|
||||
cmd_fn = cmd_output_p if color else cmd_output_b
|
||||
use_pty = color and kwargs.get('input') is None
|
||||
cmd_fn = cmd_output_p if use_pty else cmd_output_b
|
||||
retcode = 0
|
||||
stdout = b''
|
||||
|
||||
|
|
@ -164,6 +165,8 @@ def xargs(
|
|||
_max_length = 8192 - len(cmd_exe) - len(' /c ') - 1024
|
||||
|
||||
partitions = partition(cmd, varargs, target_concurrency, _max_length)
|
||||
if kwargs.get('input') is not None and len(partitions) != 1:
|
||||
raise AssertionError('`input` is only supported with one partition')
|
||||
|
||||
def run_cmd_partition(
|
||||
run_cmd: tuple[str, ...],
|
||||
|
|
|
|||
|
|
@ -572,6 +572,18 @@ def test_manifest_stages_defaulting():
|
|||
]
|
||||
|
||||
|
||||
def test_manifest_pass_filenames_via_stdin_defaulting():
|
||||
dct = {
|
||||
'id': 'fake-hook',
|
||||
'name': 'fake-hook',
|
||||
'entry': 'fake-hook',
|
||||
'language': 'system',
|
||||
}
|
||||
cfgv.validate(dct, MANIFEST_HOOK_DICT)
|
||||
dct = cfgv.apply_defaults(dct, MANIFEST_HOOK_DICT)
|
||||
assert dct['pass_filenames_via_stdin'] is False
|
||||
|
||||
|
||||
def test_config_hook_stages_defaulting_missing():
|
||||
dct = {'id': 'fake-hook'}
|
||||
cfgv.validate(dct, CONFIG_HOOK_DICT)
|
||||
|
|
|
|||
|
|
@ -1064,6 +1064,35 @@ def test_pass_filenames(
|
|||
assert (b'foo.py' in printed) == pass_filenames
|
||||
|
||||
|
||||
def test_pass_filenames_via_stdin(cap_out, store, repo_with_passing_hook):
|
||||
config = {
|
||||
'repo': 'local',
|
||||
'hooks': [{
|
||||
'id': 'filenames-via-stdin',
|
||||
'name': 'filenames-via-stdin',
|
||||
'entry': (
|
||||
f'{shlex.quote(sys.executable)} -c '
|
||||
'\'import sys; '
|
||||
'print(repr(sys.argv[1:])); '
|
||||
'print(repr(sys.stdin.buffer.read()))\''
|
||||
),
|
||||
'language': 'system',
|
||||
'pass_filenames_via_stdin': True,
|
||||
}],
|
||||
}
|
||||
add_config_to_repo(repo_with_passing_hook, config)
|
||||
stage_a_file()
|
||||
|
||||
ret, printed = _do_run(
|
||||
cap_out, store, repo_with_passing_hook,
|
||||
run_opts(hook='filenames-via-stdin', verbose=True),
|
||||
)
|
||||
|
||||
assert ret == 0
|
||||
assert b'[]' in printed
|
||||
assert b"b'foo.py\\x00'" in printed
|
||||
|
||||
|
||||
def test_fail_fast(cap_out, store, repo_with_failing_hook):
|
||||
with modify_config() as config:
|
||||
# More than one hook
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import os.path
|
||||
import shlex
|
||||
import sys
|
||||
from unittest import mock
|
||||
|
||||
|
|
@ -166,6 +167,42 @@ def test_basic_run_hook(tmp_path):
|
|||
assert out == b'hi hello file file file\n'
|
||||
|
||||
|
||||
def test_basic_run_hook_passes_filenames_via_stdin(tmp_path):
|
||||
ret, out = lang_base.basic_run_hook(
|
||||
Prefix(tmp_path),
|
||||
(
|
||||
f'{shlex.quote(sys.executable)} -c '
|
||||
'\'import sys; '
|
||||
'print(repr(sys.argv[1:])); '
|
||||
'print(repr(sys.stdin.buffer.read()))\''
|
||||
),
|
||||
(),
|
||||
['file1', 'file2'],
|
||||
is_local=False,
|
||||
require_serial=False,
|
||||
color=False,
|
||||
pass_filenames_via_stdin=True,
|
||||
)
|
||||
assert ret == 0
|
||||
out = out.replace(b'\r\n', b'\n')
|
||||
assert out == b"[]\nb'file1\\x00file2\\x00'\n"
|
||||
|
||||
|
||||
def test_to_nul_delimited_filenames():
|
||||
ret = lang_base.to_nul_delimited_filenames(('file1', 'file2'))
|
||||
assert ret == b'file1\x00file2\x00'
|
||||
|
||||
|
||||
def test_to_nul_delimited_filenames_empty():
|
||||
ret = lang_base.to_nul_delimited_filenames(())
|
||||
assert ret == b''
|
||||
|
||||
|
||||
def test_from_nul_delimited_filenames():
|
||||
ret = lang_base.from_nul_delimited_filenames(b'file1\x00file2\x00')
|
||||
assert ret == ['file1', 'file2']
|
||||
|
||||
|
||||
def test_hook_cmd():
|
||||
assert lang_base.hook_cmd('echo hi', ()) == ('echo', 'hi')
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,8 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
|
||||
from pre_commit.languages import pygrep
|
||||
|
|
@ -138,6 +141,16 @@ def test_grep_hook_matching(some_files, tmp_path):
|
|||
assert ret == (1, b"f7:1:hello'hi\n")
|
||||
|
||||
|
||||
@pytest.mark.usefixtures('some_files')
|
||||
def test_main_reads_nul_delimited_filenames_from_stdin(cap_out):
|
||||
with mock.patch.object(sys.stdin.buffer, 'read', return_value=b'f1\x00f2\x00'):
|
||||
ret = pygrep.main(('foo',))
|
||||
|
||||
out = cap_out.get()
|
||||
assert ret == 1
|
||||
assert out == 'f1:1:foo\n'
|
||||
|
||||
|
||||
@pytest.mark.parametrize('regex', ('nope', "foo'bar", r'^\[INFO\]'))
|
||||
def test_grep_hook_not_matching(regex, some_files, tmp_path):
|
||||
ret = run_language(tmp_path, pygrep, regex, file_args=('f7', 'f8', 'f9'))
|
||||
|
|
|
|||
|
|
@ -430,6 +430,7 @@ def test_manifest_hooks(tempdir_factory, store):
|
|||
minimum_pre_commit_version='0',
|
||||
name='Bash hook',
|
||||
pass_filenames=True,
|
||||
pass_filenames_via_stdin=False,
|
||||
require_serial=False,
|
||||
stages=[
|
||||
'commit-msg',
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue