From 635912514db02895591567a2c59c95931c76faae Mon Sep 17 00:00:00 2001
From: Sharmila Jesupaul
Date: Wed, 18 Feb 2026 18:01:55 -0800
Subject: [PATCH] add pass_filenames_via_stdin for large changesets

pre-commit currently passes selected filenames to hooks via argv. For
large changesets (or --all-files), argv length limits are hit and
filenames are partitioned, causing multiple hook invocations.

This means there is currently no built-in way to pass filenames to an
underlying hook in one shot without chunking / re-running. The only
practical workaround is to set pass_filenames: false and run custom git
operations in hook code to reconstruct the file set, which is expensive
and duplicates pre-commit's own file-selection logic.

This change adds a hook option:

    pass_filenames_via_stdin: true

When enabled, pre-commit sends filenames as NUL-delimited bytes on stdin
and runs the hook in a single invocation (no argv chunking).

Why NUL-delimited stdin:
- safe for filenames containing spaces/newlines
- matches established -0 conventions in unix tooling

Usage for hook authors:
- shell:
    while IFS= read -r -d '' filename; do ... done
- python:
    data = sys.stdin.buffer.read()
    filenames = [os.fsdecode(p) for p in data.split(b'\0') if p]

Behavior notes:
- default remains argv-based passing
- pass_filenames: false still disables filename passing entirely

Implementation includes schema/runtime wiring, shared NUL encode/decode
helpers, and tests covering defaulting and runtime behavior.
--- pre_commit/clientlib.py | 1 + pre_commit/commands/run.py | 3 ++ pre_commit/hook.py | 1 + pre_commit/lang_base.py | 24 ++++++++++++++ pre_commit/languages/docker.py | 2 ++ pre_commit/languages/docker_image.py | 2 ++ pre_commit/languages/fail.py | 1 + pre_commit/languages/julia.py | 2 ++ pre_commit/languages/pygrep.py | 9 ++++++ pre_commit/languages/r.py | 2 ++ pre_commit/languages/unsupported_script.py | 2 ++ pre_commit/util.py | 3 +- pre_commit/xargs.py | 5 ++- tests/clientlib_test.py | 12 +++++++ tests/commands/run_test.py | 29 +++++++++++++++++ tests/lang_base_test.py | 37 ++++++++++++++++++++++ tests/languages/pygrep_test.py | 13 ++++++++ tests/repository_test.py | 1 + 18 files changed, 147 insertions(+), 2 deletions(-) diff --git a/pre_commit/clientlib.py b/pre_commit/clientlib.py index 51f14d26..57a64c10 100644 --- a/pre_commit/clientlib.py +++ b/pre_commit/clientlib.py @@ -256,6 +256,7 @@ MANIFEST_HOOK_DICT = cfgv.Map( cfgv.Optional('always_run', cfgv.check_bool, False), cfgv.Optional('fail_fast', cfgv.check_bool, False), cfgv.Optional('pass_filenames', cfgv.check_bool, True), + cfgv.Optional('pass_filenames_via_stdin', cfgv.check_bool, False), cfgv.Optional('description', cfgv.check_string, ''), cfgv.Optional('language_version', cfgv.check_string, C.DEFAULT), cfgv.Optional('log_file', cfgv.check_string, ''), diff --git a/pre_commit/commands/run.py b/pre_commit/commands/run.py index 8ab505ff..de83df17 100644 --- a/pre_commit/commands/run.py +++ b/pre_commit/commands/run.py @@ -185,8 +185,10 @@ def _run_single_hook( # print hook and dots first in case the hook takes a while to run output.write(_start_msg(start=hook.name, end_len=6, cols=cols)) + pass_filenames_via_stdin = hook.pass_filenames_via_stdin if not hook.pass_filenames: filenames = () + pass_filenames_via_stdin = False time_before = time.monotonic() language = languages[hook.language] with language.in_env(hook.prefix, hook.language_version): @@ -198,6 +200,7 @@ def _run_single_hook( 
is_local=hook.src == 'local', require_serial=hook.require_serial, color=use_color, + pass_filenames_via_stdin=pass_filenames_via_stdin, ) duration = round(time.monotonic() - time_before, 2) or 0 diff_after = _get_diff() diff --git a/pre_commit/hook.py b/pre_commit/hook.py index 309cd5be..af5e3c28 100644 --- a/pre_commit/hook.py +++ b/pre_commit/hook.py @@ -28,6 +28,7 @@ class Hook(NamedTuple): always_run: bool fail_fast: bool pass_filenames: bool + pass_filenames_via_stdin: bool description: str language_version: str log_file: str diff --git a/pre_commit/lang_base.py b/pre_commit/lang_base.py index 198e9365..bf027364 100644 --- a/pre_commit/lang_base.py +++ b/pre_commit/lang_base.py @@ -22,6 +22,7 @@ from pre_commit.util import cmd_output_b FIXED_RANDOM_SEED = 1542676187 SHIMS_RE = re.compile(r'[/\\]shims[/\\]') +NUL = b'\0' class Language(Protocol): @@ -56,6 +57,7 @@ class Language(Protocol): is_local: bool, require_serial: bool, color: bool, + pass_filenames_via_stdin: bool = False, ) -> tuple[int, bytes]: ... 
@@ -153,13 +155,33 @@ def _shuffled(seq: Sequence[str]) -> list[str]: return seq +def to_nul_delimited_filenames(file_args: Sequence[str]) -> bytes: + ret = NUL.join(os.fsencode(filename) for filename in file_args) + return ret + NUL if ret else ret + + +def from_nul_delimited_filenames(filenames: bytes) -> list[str]: + return [os.fsdecode(part) for part in filenames.split(NUL) if part] + + def run_xargs( cmd: tuple[str, ...], file_args: Sequence[str], *, require_serial: bool, color: bool, + pass_filenames_via_stdin: bool = False, ) -> tuple[int, bytes]: + if pass_filenames_via_stdin: + stdin = to_nul_delimited_filenames(file_args) + return xargs.xargs( + cmd, + (), + target_concurrency=1, + color=color, + input=stdin, + ) + if require_serial: jobs = 1 else: @@ -187,10 +209,12 @@ def basic_run_hook( is_local: bool, require_serial: bool, color: bool, + pass_filenames_via_stdin: bool = False, ) -> tuple[int, bytes]: return run_xargs( hook_cmd(entry, args), file_args, require_serial=require_serial, color=color, + pass_filenames_via_stdin=pass_filenames_via_stdin, ) diff --git a/pre_commit/languages/docker.py b/pre_commit/languages/docker.py index 7f45ac86..be09c73d 100644 --- a/pre_commit/languages/docker.py +++ b/pre_commit/languages/docker.py @@ -165,6 +165,7 @@ def run_hook( is_local: bool, require_serial: bool, color: bool, + pass_filenames_via_stdin: bool = False, ) -> tuple[int, bytes]: # pragma: win32 no cover # Rebuild the docker image in case it has gone missing, as many people do # automated cleanup of docker images. 
@@ -178,4 +179,5 @@ def run_hook( file_args, require_serial=require_serial, color=color, + pass_filenames_via_stdin=pass_filenames_via_stdin, ) diff --git a/pre_commit/languages/docker_image.py b/pre_commit/languages/docker_image.py index 60caa101..69aab050 100644 --- a/pre_commit/languages/docker_image.py +++ b/pre_commit/languages/docker_image.py @@ -22,6 +22,7 @@ def run_hook( is_local: bool, require_serial: bool, color: bool, + pass_filenames_via_stdin: bool = False, ) -> tuple[int, bytes]: # pragma: win32 no cover cmd = docker_cmd(color=color) + lang_base.hook_cmd(entry, args) return lang_base.run_xargs( @@ -29,4 +30,5 @@ def run_hook( file_args, require_serial=require_serial, color=color, + pass_filenames_via_stdin=pass_filenames_via_stdin, ) diff --git a/pre_commit/languages/fail.py b/pre_commit/languages/fail.py index 6ac4d767..7a297ca5 100644 --- a/pre_commit/languages/fail.py +++ b/pre_commit/languages/fail.py @@ -21,6 +21,7 @@ def run_hook( is_local: bool, require_serial: bool, color: bool, + pass_filenames_via_stdin: bool = False, ) -> tuple[int, bytes]: out = f'{entry}\n\n'.encode() out += b'\n'.join(f.encode() for f in file_args) + b'\n' diff --git a/pre_commit/languages/julia.py b/pre_commit/languages/julia.py index 7559b5ba..245b1751 100644 --- a/pre_commit/languages/julia.py +++ b/pre_commit/languages/julia.py @@ -27,6 +27,7 @@ def run_hook( is_local: bool, require_serial: bool, color: bool, + pass_filenames_via_stdin: bool = False, ) -> tuple[int, bytes]: # `entry` is a (hook-repo relative) file followed by (optional) args, e.g. 
# `bin/id.jl` or `bin/hook.jl --arg1 --arg2` so we @@ -43,6 +44,7 @@ def run_hook( file_args, require_serial=require_serial, color=color, + pass_filenames_via_stdin=pass_filenames_via_stdin, ) diff --git a/pre_commit/languages/pygrep.py b/pre_commit/languages/pygrep.py index 72a9345f..afd84658 100644 --- a/pre_commit/languages/pygrep.py +++ b/pre_commit/languages/pygrep.py @@ -96,8 +96,12 @@ def run_hook( is_local: bool, require_serial: bool, color: bool, + pass_filenames_via_stdin: bool = False, ) -> tuple[int, bytes]: cmd = (sys.executable, '-m', __name__, *args, entry) + if pass_filenames_via_stdin: + stdin = lang_base.to_nul_delimited_filenames(file_args) + return xargs(cmd, (), color=color, input=stdin) return xargs(cmd, file_args, color=color) @@ -116,6 +120,11 @@ def main(argv: Sequence[str] | None = None) -> int: parser.add_argument('filenames', nargs='*') args = parser.parse_args(argv) + if not args.filenames: + stdin = sys.stdin.buffer.read() + if stdin: + args.filenames = lang_base.from_nul_delimited_filenames(stdin) + flags = re.IGNORECASE if args.ignore_case else 0 if args.multiline: flags |= re.MULTILINE | re.DOTALL diff --git a/pre_commit/languages/r.py b/pre_commit/languages/r.py index f70d2fdc..ec9ff493 100644 --- a/pre_commit/languages/r.py +++ b/pre_commit/languages/r.py @@ -268,6 +268,7 @@ def run_hook( is_local: bool, require_serial: bool, color: bool, + pass_filenames_via_stdin: bool = False, ) -> tuple[int, bytes]: cmd = _cmd_from_hook(prefix, entry, args, is_local=is_local) return lang_base.run_xargs( @@ -275,4 +276,5 @@ def run_hook( file_args, require_serial=require_serial, color=color, + pass_filenames_via_stdin=pass_filenames_via_stdin, ) diff --git a/pre_commit/languages/unsupported_script.py b/pre_commit/languages/unsupported_script.py index 1eaa1e27..2def776b 100644 --- a/pre_commit/languages/unsupported_script.py +++ b/pre_commit/languages/unsupported_script.py @@ -21,6 +21,7 @@ def run_hook( is_local: bool, require_serial: bool, 
color: bool, + pass_filenames_via_stdin: bool = False, ) -> tuple[int, bytes]: cmd = lang_base.hook_cmd(entry, args) cmd = (prefix.path(cmd[0]), *cmd[1:]) @@ -29,4 +30,5 @@ def run_hook( file_args, require_serial=require_serial, color=color, + pass_filenames_via_stdin=pass_filenames_via_stdin, ) diff --git a/pre_commit/util.py b/pre_commit/util.py index 19b1880b..47c49f7e 100644 --- a/pre_commit/util.py +++ b/pre_commit/util.py @@ -92,6 +92,7 @@ def cmd_output_b( check: bool = True, **kwargs: Any, ) -> tuple[int, bytes, bytes | None]: + input_data = kwargs.pop('input', None) _setdefault_kwargs(kwargs) try: @@ -104,7 +105,7 @@ def cmd_output_b( except OSError as e: returncode, stdout_b, stderr_b = _oserror_to_output(e) else: - stdout_b, stderr_b = proc.communicate() + stdout_b, stderr_b = proc.communicate(input_data) returncode = proc.returncode if check and returncode: diff --git a/pre_commit/xargs.py b/pre_commit/xargs.py index 7c98d167..a61cdeb8 100644 --- a/pre_commit/xargs.py +++ b/pre_commit/xargs.py @@ -142,7 +142,8 @@ def xargs( color: Make a pty if on a platform that supports it target_concurrency: Target number of partitions to run concurrently """ - cmd_fn = cmd_output_p if color else cmd_output_b + use_pty = color and kwargs.get('input') is None + cmd_fn = cmd_output_p if use_pty else cmd_output_b retcode = 0 stdout = b'' @@ -164,6 +165,8 @@ def xargs( _max_length = 8192 - len(cmd_exe) - len(' /c ') - 1024 partitions = partition(cmd, varargs, target_concurrency, _max_length) + if kwargs.get('input') is not None and len(partitions) != 1: + raise AssertionError('`input` is only supported with one partition') def run_cmd_partition( run_cmd: tuple[str, ...], diff --git a/tests/clientlib_test.py b/tests/clientlib_test.py index 2c42b80c..3a93e8bf 100644 --- a/tests/clientlib_test.py +++ b/tests/clientlib_test.py @@ -572,6 +572,18 @@ def test_manifest_stages_defaulting(): ] +def test_manifest_pass_filenames_via_stdin_defaulting(): + dct = { + 'id': 'fake-hook', 
+ 'name': 'fake-hook', + 'entry': 'fake-hook', + 'language': 'system', + } + cfgv.validate(dct, MANIFEST_HOOK_DICT) + dct = cfgv.apply_defaults(dct, MANIFEST_HOOK_DICT) + assert dct['pass_filenames_via_stdin'] is False + + def test_config_hook_stages_defaulting_missing(): dct = {'id': 'fake-hook'} cfgv.validate(dct, CONFIG_HOOK_DICT) diff --git a/tests/commands/run_test.py b/tests/commands/run_test.py index e4af1e16..7c203e37 100644 --- a/tests/commands/run_test.py +++ b/tests/commands/run_test.py @@ -1064,6 +1064,35 @@ def test_pass_filenames( assert (b'foo.py' in printed) == pass_filenames +def test_pass_filenames_via_stdin(cap_out, store, repo_with_passing_hook): + config = { + 'repo': 'local', + 'hooks': [{ + 'id': 'filenames-via-stdin', + 'name': 'filenames-via-stdin', + 'entry': ( + f'{shlex.quote(sys.executable)} -c ' + '\'import sys; ' + 'print(repr(sys.argv[1:])); ' + 'print(repr(sys.stdin.buffer.read()))\'' + ), + 'language': 'system', + 'pass_filenames_via_stdin': True, + }], + } + add_config_to_repo(repo_with_passing_hook, config) + stage_a_file() + + ret, printed = _do_run( + cap_out, store, repo_with_passing_hook, + run_opts(hook='filenames-via-stdin', verbose=True), + ) + + assert ret == 0 + assert b'[]' in printed + assert b"b'foo.py\\x00'" in printed + + def test_fail_fast(cap_out, store, repo_with_failing_hook): with modify_config() as config: # More than one hook diff --git a/tests/lang_base_test.py b/tests/lang_base_test.py index 9fac83da..de9334e8 100644 --- a/tests/lang_base_test.py +++ b/tests/lang_base_test.py @@ -1,6 +1,7 @@ from __future__ import annotations import os.path +import shlex import sys from unittest import mock @@ -166,6 +167,42 @@ def test_basic_run_hook(tmp_path): assert out == b'hi hello file file file\n' +def test_basic_run_hook_passes_filenames_via_stdin(tmp_path): + ret, out = lang_base.basic_run_hook( + Prefix(tmp_path), + ( + f'{shlex.quote(sys.executable)} -c ' + '\'import sys; ' + 'print(repr(sys.argv[1:])); ' + 
'print(repr(sys.stdin.buffer.read()))\'' + ), + (), + ['file1', 'file2'], + is_local=False, + require_serial=False, + color=False, + pass_filenames_via_stdin=True, + ) + assert ret == 0 + out = out.replace(b'\r\n', b'\n') + assert out == b"[]\nb'file1\\x00file2\\x00'\n" + + +def test_to_nul_delimited_filenames(): + ret = lang_base.to_nul_delimited_filenames(('file1', 'file2')) + assert ret == b'file1\x00file2\x00' + + +def test_to_nul_delimited_filenames_empty(): + ret = lang_base.to_nul_delimited_filenames(()) + assert ret == b'' + + +def test_from_nul_delimited_filenames(): + ret = lang_base.from_nul_delimited_filenames(b'file1\x00file2\x00') + assert ret == ['file1', 'file2'] + + def test_hook_cmd(): assert lang_base.hook_cmd('echo hi', ()) == ('echo', 'hi') diff --git a/tests/languages/pygrep_test.py b/tests/languages/pygrep_test.py index c6271c80..ef52bfb6 100644 --- a/tests/languages/pygrep_test.py +++ b/tests/languages/pygrep_test.py @@ -1,5 +1,8 @@ from __future__ import annotations +import sys +from unittest import mock + import pytest from pre_commit.languages import pygrep @@ -138,6 +141,16 @@ def test_grep_hook_matching(some_files, tmp_path): assert ret == (1, b"f7:1:hello'hi\n") +@pytest.mark.usefixtures('some_files') +def test_main_reads_nul_delimited_filenames_from_stdin(cap_out): + with mock.patch.object(sys.stdin.buffer, 'read', return_value=b'f1\x00f2\x00'): + ret = pygrep.main(('foo',)) + + out = cap_out.get() + assert ret == 1 + assert out == 'f1:1:foo\n' + + @pytest.mark.parametrize('regex', ('nope', "foo'bar", r'^\[INFO\]')) def test_grep_hook_not_matching(regex, some_files, tmp_path): ret = run_language(tmp_path, pygrep, regex, file_args=('f7', 'f8', 'f9')) diff --git a/tests/repository_test.py b/tests/repository_test.py index 5d71c3e4..c06fea86 100644 --- a/tests/repository_test.py +++ b/tests/repository_test.py @@ -430,6 +430,7 @@ def test_manifest_hooks(tempdir_factory, store): minimum_pre_commit_version='0', name='Bash hook', 
pass_filenames=True, + pass_filenames_via_stdin=False, require_serial=False, stages=[ 'commit-msg',