add pass_filenames_via_stdin for large changesets

pre-commit currently passes selected filenames to hooks via argv.
For large changesets (or --all-files), argv length limits are hit and
filenames are partitioned, causing multiple hook invocations.

This means there is currently no built-in way to pass filenames to an
underlying hook in one shot without chunking / re-running. The only practical
workaround is to set pass_filenames: false and run custom git operations in
hook code to reconstruct the file set, which is expensive and duplicates
pre-commit's own file-selection logic.

This change adds a hook option:

    pass_filenames_via_stdin: true

When enabled, pre-commit sends filenames as NUL-delimited bytes on stdin and
runs the hook in a single invocation (no argv chunking).

Why NUL-delimited stdin:
- safe for filenames containing spaces/newlines
- matches established -0 conventions in unix tooling

Usage for hook authors:
- shell:

    while IFS= read -r -d '' filename; do
        ...
    done

- python:

    data = sys.stdin.buffer.read()
    filenames = [os.fsdecode(p) for p in data.split(b'\0') if p]

Behavior notes:
- default remains argv-based passing
- pass_filenames: false still disables filename passing entirely

Implementation includes schema/runtime wiring, shared NUL encode/decode
helpers, and tests covering defaulting and runtime behavior.
This commit is contained in:
Sharmila Jesupaul 2026-02-18 18:01:55 -08:00
parent 8416413a0e
commit 635912514d
18 changed files with 147 additions and 2 deletions

View file

@ -256,6 +256,7 @@ MANIFEST_HOOK_DICT = cfgv.Map(
cfgv.Optional('always_run', cfgv.check_bool, False), cfgv.Optional('always_run', cfgv.check_bool, False),
cfgv.Optional('fail_fast', cfgv.check_bool, False), cfgv.Optional('fail_fast', cfgv.check_bool, False),
cfgv.Optional('pass_filenames', cfgv.check_bool, True), cfgv.Optional('pass_filenames', cfgv.check_bool, True),
cfgv.Optional('pass_filenames_via_stdin', cfgv.check_bool, False),
cfgv.Optional('description', cfgv.check_string, ''), cfgv.Optional('description', cfgv.check_string, ''),
cfgv.Optional('language_version', cfgv.check_string, C.DEFAULT), cfgv.Optional('language_version', cfgv.check_string, C.DEFAULT),
cfgv.Optional('log_file', cfgv.check_string, ''), cfgv.Optional('log_file', cfgv.check_string, ''),

View file

@ -185,8 +185,10 @@ def _run_single_hook(
# print hook and dots first in case the hook takes a while to run # print hook and dots first in case the hook takes a while to run
output.write(_start_msg(start=hook.name, end_len=6, cols=cols)) output.write(_start_msg(start=hook.name, end_len=6, cols=cols))
pass_filenames_via_stdin = hook.pass_filenames_via_stdin
if not hook.pass_filenames: if not hook.pass_filenames:
filenames = () filenames = ()
pass_filenames_via_stdin = False
time_before = time.monotonic() time_before = time.monotonic()
language = languages[hook.language] language = languages[hook.language]
with language.in_env(hook.prefix, hook.language_version): with language.in_env(hook.prefix, hook.language_version):
@ -198,6 +200,7 @@ def _run_single_hook(
is_local=hook.src == 'local', is_local=hook.src == 'local',
require_serial=hook.require_serial, require_serial=hook.require_serial,
color=use_color, color=use_color,
pass_filenames_via_stdin=pass_filenames_via_stdin,
) )
duration = round(time.monotonic() - time_before, 2) or 0 duration = round(time.monotonic() - time_before, 2) or 0
diff_after = _get_diff() diff_after = _get_diff()

View file

@ -28,6 +28,7 @@ class Hook(NamedTuple):
always_run: bool always_run: bool
fail_fast: bool fail_fast: bool
pass_filenames: bool pass_filenames: bool
pass_filenames_via_stdin: bool
description: str description: str
language_version: str language_version: str
log_file: str log_file: str

View file

@ -22,6 +22,7 @@ from pre_commit.util import cmd_output_b
FIXED_RANDOM_SEED = 1542676187 FIXED_RANDOM_SEED = 1542676187
SHIMS_RE = re.compile(r'[/\\]shims[/\\]') SHIMS_RE = re.compile(r'[/\\]shims[/\\]')
NUL = b'\0'
class Language(Protocol): class Language(Protocol):
@ -56,6 +57,7 @@ class Language(Protocol):
is_local: bool, is_local: bool,
require_serial: bool, require_serial: bool,
color: bool, color: bool,
pass_filenames_via_stdin: bool = False,
) -> tuple[int, bytes]: ) -> tuple[int, bytes]:
... ...
@ -153,13 +155,33 @@ def _shuffled(seq: Sequence[str]) -> list[str]:
return seq return seq
def to_nul_delimited_filenames(file_args: Sequence[str]) -> bytes:
ret = NUL.join(os.fsencode(filename) for filename in file_args)
return ret + NUL if ret else ret
def from_nul_delimited_filenames(filenames: bytes) -> list[str]:
return [os.fsdecode(part) for part in filenames.split(NUL) if part]
def run_xargs( def run_xargs(
cmd: tuple[str, ...], cmd: tuple[str, ...],
file_args: Sequence[str], file_args: Sequence[str],
*, *,
require_serial: bool, require_serial: bool,
color: bool, color: bool,
pass_filenames_via_stdin: bool = False,
) -> tuple[int, bytes]: ) -> tuple[int, bytes]:
if pass_filenames_via_stdin:
stdin = to_nul_delimited_filenames(file_args)
return xargs.xargs(
cmd,
(),
target_concurrency=1,
color=color,
input=stdin,
)
if require_serial: if require_serial:
jobs = 1 jobs = 1
else: else:
@ -187,10 +209,12 @@ def basic_run_hook(
is_local: bool, is_local: bool,
require_serial: bool, require_serial: bool,
color: bool, color: bool,
pass_filenames_via_stdin: bool = False,
) -> tuple[int, bytes]: ) -> tuple[int, bytes]:
return run_xargs( return run_xargs(
hook_cmd(entry, args), hook_cmd(entry, args),
file_args, file_args,
require_serial=require_serial, require_serial=require_serial,
color=color, color=color,
pass_filenames_via_stdin=pass_filenames_via_stdin,
) )

View file

@ -165,6 +165,7 @@ def run_hook(
is_local: bool, is_local: bool,
require_serial: bool, require_serial: bool,
color: bool, color: bool,
pass_filenames_via_stdin: bool = False,
) -> tuple[int, bytes]: # pragma: win32 no cover ) -> tuple[int, bytes]: # pragma: win32 no cover
# Rebuild the docker image in case it has gone missing, as many people do # Rebuild the docker image in case it has gone missing, as many people do
# automated cleanup of docker images. # automated cleanup of docker images.
@ -178,4 +179,5 @@ def run_hook(
file_args, file_args,
require_serial=require_serial, require_serial=require_serial,
color=color, color=color,
pass_filenames_via_stdin=pass_filenames_via_stdin,
) )

View file

@ -22,6 +22,7 @@ def run_hook(
is_local: bool, is_local: bool,
require_serial: bool, require_serial: bool,
color: bool, color: bool,
pass_filenames_via_stdin: bool = False,
) -> tuple[int, bytes]: # pragma: win32 no cover ) -> tuple[int, bytes]: # pragma: win32 no cover
cmd = docker_cmd(color=color) + lang_base.hook_cmd(entry, args) cmd = docker_cmd(color=color) + lang_base.hook_cmd(entry, args)
return lang_base.run_xargs( return lang_base.run_xargs(
@ -29,4 +30,5 @@ def run_hook(
file_args, file_args,
require_serial=require_serial, require_serial=require_serial,
color=color, color=color,
pass_filenames_via_stdin=pass_filenames_via_stdin,
) )

View file

@ -21,6 +21,7 @@ def run_hook(
is_local: bool, is_local: bool,
require_serial: bool, require_serial: bool,
color: bool, color: bool,
pass_filenames_via_stdin: bool = False,
) -> tuple[int, bytes]: ) -> tuple[int, bytes]:
out = f'{entry}\n\n'.encode() out = f'{entry}\n\n'.encode()
out += b'\n'.join(f.encode() for f in file_args) + b'\n' out += b'\n'.join(f.encode() for f in file_args) + b'\n'

View file

@ -27,6 +27,7 @@ def run_hook(
is_local: bool, is_local: bool,
require_serial: bool, require_serial: bool,
color: bool, color: bool,
pass_filenames_via_stdin: bool = False,
) -> tuple[int, bytes]: ) -> tuple[int, bytes]:
# `entry` is a (hook-repo relative) file followed by (optional) args, e.g. # `entry` is a (hook-repo relative) file followed by (optional) args, e.g.
# `bin/id.jl` or `bin/hook.jl --arg1 --arg2` so we # `bin/id.jl` or `bin/hook.jl --arg1 --arg2` so we
@ -43,6 +44,7 @@ def run_hook(
file_args, file_args,
require_serial=require_serial, require_serial=require_serial,
color=color, color=color,
pass_filenames_via_stdin=pass_filenames_via_stdin,
) )

View file

@ -96,8 +96,12 @@ def run_hook(
is_local: bool, is_local: bool,
require_serial: bool, require_serial: bool,
color: bool, color: bool,
pass_filenames_via_stdin: bool = False,
) -> tuple[int, bytes]: ) -> tuple[int, bytes]:
cmd = (sys.executable, '-m', __name__, *args, entry) cmd = (sys.executable, '-m', __name__, *args, entry)
if pass_filenames_via_stdin:
stdin = lang_base.to_nul_delimited_filenames(file_args)
return xargs(cmd, (), color=color, input=stdin)
return xargs(cmd, file_args, color=color) return xargs(cmd, file_args, color=color)
@ -116,6 +120,11 @@ def main(argv: Sequence[str] | None = None) -> int:
parser.add_argument('filenames', nargs='*') parser.add_argument('filenames', nargs='*')
args = parser.parse_args(argv) args = parser.parse_args(argv)
if not args.filenames:
stdin = sys.stdin.buffer.read()
if stdin:
args.filenames = lang_base.from_nul_delimited_filenames(stdin)
flags = re.IGNORECASE if args.ignore_case else 0 flags = re.IGNORECASE if args.ignore_case else 0
if args.multiline: if args.multiline:
flags |= re.MULTILINE | re.DOTALL flags |= re.MULTILINE | re.DOTALL

View file

@ -268,6 +268,7 @@ def run_hook(
is_local: bool, is_local: bool,
require_serial: bool, require_serial: bool,
color: bool, color: bool,
pass_filenames_via_stdin: bool = False,
) -> tuple[int, bytes]: ) -> tuple[int, bytes]:
cmd = _cmd_from_hook(prefix, entry, args, is_local=is_local) cmd = _cmd_from_hook(prefix, entry, args, is_local=is_local)
return lang_base.run_xargs( return lang_base.run_xargs(
@ -275,4 +276,5 @@ def run_hook(
file_args, file_args,
require_serial=require_serial, require_serial=require_serial,
color=color, color=color,
pass_filenames_via_stdin=pass_filenames_via_stdin,
) )

View file

@ -21,6 +21,7 @@ def run_hook(
is_local: bool, is_local: bool,
require_serial: bool, require_serial: bool,
color: bool, color: bool,
pass_filenames_via_stdin: bool = False,
) -> tuple[int, bytes]: ) -> tuple[int, bytes]:
cmd = lang_base.hook_cmd(entry, args) cmd = lang_base.hook_cmd(entry, args)
cmd = (prefix.path(cmd[0]), *cmd[1:]) cmd = (prefix.path(cmd[0]), *cmd[1:])
@ -29,4 +30,5 @@ def run_hook(
file_args, file_args,
require_serial=require_serial, require_serial=require_serial,
color=color, color=color,
pass_filenames_via_stdin=pass_filenames_via_stdin,
) )

View file

@ -92,6 +92,7 @@ def cmd_output_b(
check: bool = True, check: bool = True,
**kwargs: Any, **kwargs: Any,
) -> tuple[int, bytes, bytes | None]: ) -> tuple[int, bytes, bytes | None]:
input_data = kwargs.pop('input', None)
_setdefault_kwargs(kwargs) _setdefault_kwargs(kwargs)
try: try:
@ -104,7 +105,7 @@ def cmd_output_b(
except OSError as e: except OSError as e:
returncode, stdout_b, stderr_b = _oserror_to_output(e) returncode, stdout_b, stderr_b = _oserror_to_output(e)
else: else:
stdout_b, stderr_b = proc.communicate() stdout_b, stderr_b = proc.communicate(input_data)
returncode = proc.returncode returncode = proc.returncode
if check and returncode: if check and returncode:

View file

@ -142,7 +142,8 @@ def xargs(
color: Make a pty if on a platform that supports it color: Make a pty if on a platform that supports it
target_concurrency: Target number of partitions to run concurrently target_concurrency: Target number of partitions to run concurrently
""" """
cmd_fn = cmd_output_p if color else cmd_output_b use_pty = color and kwargs.get('input') is None
cmd_fn = cmd_output_p if use_pty else cmd_output_b
retcode = 0 retcode = 0
stdout = b'' stdout = b''
@ -164,6 +165,8 @@ def xargs(
_max_length = 8192 - len(cmd_exe) - len(' /c ') - 1024 _max_length = 8192 - len(cmd_exe) - len(' /c ') - 1024
partitions = partition(cmd, varargs, target_concurrency, _max_length) partitions = partition(cmd, varargs, target_concurrency, _max_length)
if kwargs.get('input') is not None and len(partitions) != 1:
raise AssertionError('`input` is only supported with one partition')
def run_cmd_partition( def run_cmd_partition(
run_cmd: tuple[str, ...], run_cmd: tuple[str, ...],

View file

@ -572,6 +572,18 @@ def test_manifest_stages_defaulting():
] ]
def test_manifest_pass_filenames_via_stdin_defaulting():
dct = {
'id': 'fake-hook',
'name': 'fake-hook',
'entry': 'fake-hook',
'language': 'system',
}
cfgv.validate(dct, MANIFEST_HOOK_DICT)
dct = cfgv.apply_defaults(dct, MANIFEST_HOOK_DICT)
assert dct['pass_filenames_via_stdin'] is False
def test_config_hook_stages_defaulting_missing(): def test_config_hook_stages_defaulting_missing():
dct = {'id': 'fake-hook'} dct = {'id': 'fake-hook'}
cfgv.validate(dct, CONFIG_HOOK_DICT) cfgv.validate(dct, CONFIG_HOOK_DICT)

View file

@ -1064,6 +1064,35 @@ def test_pass_filenames(
assert (b'foo.py' in printed) == pass_filenames assert (b'foo.py' in printed) == pass_filenames
def test_pass_filenames_via_stdin(cap_out, store, repo_with_passing_hook):
config = {
'repo': 'local',
'hooks': [{
'id': 'filenames-via-stdin',
'name': 'filenames-via-stdin',
'entry': (
f'{shlex.quote(sys.executable)} -c '
'\'import sys; '
'print(repr(sys.argv[1:])); '
'print(repr(sys.stdin.buffer.read()))\''
),
'language': 'system',
'pass_filenames_via_stdin': True,
}],
}
add_config_to_repo(repo_with_passing_hook, config)
stage_a_file()
ret, printed = _do_run(
cap_out, store, repo_with_passing_hook,
run_opts(hook='filenames-via-stdin', verbose=True),
)
assert ret == 0
assert b'[]' in printed
assert b"b'foo.py\\x00'" in printed
def test_fail_fast(cap_out, store, repo_with_failing_hook): def test_fail_fast(cap_out, store, repo_with_failing_hook):
with modify_config() as config: with modify_config() as config:
# More than one hook # More than one hook

View file

@ -1,6 +1,7 @@
from __future__ import annotations from __future__ import annotations
import os.path import os.path
import shlex
import sys import sys
from unittest import mock from unittest import mock
@ -166,6 +167,42 @@ def test_basic_run_hook(tmp_path):
assert out == b'hi hello file file file\n' assert out == b'hi hello file file file\n'
def test_basic_run_hook_passes_filenames_via_stdin(tmp_path):
ret, out = lang_base.basic_run_hook(
Prefix(tmp_path),
(
f'{shlex.quote(sys.executable)} -c '
'\'import sys; '
'print(repr(sys.argv[1:])); '
'print(repr(sys.stdin.buffer.read()))\''
),
(),
['file1', 'file2'],
is_local=False,
require_serial=False,
color=False,
pass_filenames_via_stdin=True,
)
assert ret == 0
out = out.replace(b'\r\n', b'\n')
assert out == b"[]\nb'file1\\x00file2\\x00'\n"
def test_to_nul_delimited_filenames():
ret = lang_base.to_nul_delimited_filenames(('file1', 'file2'))
assert ret == b'file1\x00file2\x00'
def test_to_nul_delimited_filenames_empty():
ret = lang_base.to_nul_delimited_filenames(())
assert ret == b''
def test_from_nul_delimited_filenames():
ret = lang_base.from_nul_delimited_filenames(b'file1\x00file2\x00')
assert ret == ['file1', 'file2']
def test_hook_cmd(): def test_hook_cmd():
assert lang_base.hook_cmd('echo hi', ()) == ('echo', 'hi') assert lang_base.hook_cmd('echo hi', ()) == ('echo', 'hi')

View file

@ -1,5 +1,8 @@
from __future__ import annotations from __future__ import annotations
import sys
from unittest import mock
import pytest import pytest
from pre_commit.languages import pygrep from pre_commit.languages import pygrep
@ -138,6 +141,16 @@ def test_grep_hook_matching(some_files, tmp_path):
assert ret == (1, b"f7:1:hello'hi\n") assert ret == (1, b"f7:1:hello'hi\n")
@pytest.mark.usefixtures('some_files')
def test_main_reads_nul_delimited_filenames_from_stdin(cap_out):
with mock.patch.object(sys.stdin.buffer, 'read', return_value=b'f1\x00f2\x00'):
ret = pygrep.main(('foo',))
out = cap_out.get()
assert ret == 1
assert out == 'f1:1:foo\n'
@pytest.mark.parametrize('regex', ('nope', "foo'bar", r'^\[INFO\]')) @pytest.mark.parametrize('regex', ('nope', "foo'bar", r'^\[INFO\]'))
def test_grep_hook_not_matching(regex, some_files, tmp_path): def test_grep_hook_not_matching(regex, some_files, tmp_path):
ret = run_language(tmp_path, pygrep, regex, file_args=('f7', 'f8', 'f9')) ret = run_language(tmp_path, pygrep, regex, file_args=('f7', 'f8', 'f9'))

View file

@ -430,6 +430,7 @@ def test_manifest_hooks(tempdir_factory, store):
minimum_pre_commit_version='0', minimum_pre_commit_version='0',
name='Bash hook', name='Bash hook',
pass_filenames=True, pass_filenames=True,
pass_filenames_via_stdin=False,
require_serial=False, require_serial=False,
stages=[ stages=[
'commit-msg', 'commit-msg',