add pass_filenames_via_stdin for large changesets

pre-commit currently passes selected filenames to hooks via argv.
For large changesets (or --all-files), the filename list can exceed the
platform's argv length limit, so pre-commit partitions it and invokes the
hook multiple times.

This means there is currently no built-in way to pass filenames to an
underlying hook in one shot without chunking / re-running. The only practical
workaround is to set pass_filenames: false and run custom git operations in
hook code to reconstruct the file set, which is expensive and duplicates
pre-commit's own file-selection logic.

This change adds a hook option:

    pass_filenames_via_stdin: true

When enabled, pre-commit sends filenames as NUL-delimited bytes on stdin and
runs the hook in a single invocation (no argv chunking).

Why NUL-delimited stdin:
- safe for filenames containing spaces/newlines
- matches established -0 conventions in unix tooling

Usage for hook authors:
- shell (bash; `read -d` is not available in every sh):

    while IFS= read -r -d '' filename; do
        ...
    done

- python:

    data = sys.stdin.buffer.read()
    filenames = [os.fsdecode(p) for p in data.split(b'\0') if p]

Behavior notes:
- default remains argv-based passing
- pass_filenames: false still disables filename passing entirely

Implementation includes schema/runtime wiring, shared NUL encode/decode
helpers, and tests covering defaulting and runtime behavior.
This commit is contained in:
Sharmila Jesupaul 2026-02-18 18:01:55 -08:00
parent 8416413a0e
commit 635912514d
18 changed files with 147 additions and 2 deletions

View file

@ -572,6 +572,18 @@ def test_manifest_stages_defaulting():
]
def test_manifest_pass_filenames_via_stdin_defaulting():
    """A manifest hook omitting the option defaults it to ``False``."""
    # minimal manifest hook: no 'pass_filenames_via_stdin' key is given
    hook = {
        **dict.fromkeys(('id', 'name', 'entry'), 'fake-hook'),
        'language': 'system',
    }
    cfgv.validate(hook, MANIFEST_HOOK_DICT)
    defaulted = cfgv.apply_defaults(hook, MANIFEST_HOOK_DICT)
    assert defaulted['pass_filenames_via_stdin'] is False
def test_config_hook_stages_defaulting_missing():
dct = {'id': 'fake-hook'}
cfgv.validate(dct, CONFIG_HOOK_DICT)

View file

@ -1064,6 +1064,35 @@ def test_pass_filenames(
assert (b'foo.py' in printed) == pass_filenames
def test_pass_filenames_via_stdin(cap_out, store, repo_with_passing_hook):
    """Run a local hook configured with ``pass_filenames_via_stdin: True``.

    The hook command prints the repr of its argv followed by the repr of
    the raw bytes it reads from stdin.  The assertions check the captured
    run output for an empty argv list and for ``foo.py`` terminated by a
    NUL byte.
    """
    # the hook echoes what it received: argv first, then stdin bytes
    entry = (
        f'{shlex.quote(sys.executable)} -c '
        '\'import sys; '
        'print(repr(sys.argv[1:])); '
        'print(repr(sys.stdin.buffer.read()))\''
    )
    hook = {
        'id': 'filenames-via-stdin',
        'name': 'filenames-via-stdin',
        'entry': entry,
        'language': 'system',
        'pass_filenames_via_stdin': True,
    }
    add_config_to_repo(
        repo_with_passing_hook,
        {'repo': 'local', 'hooks': [hook]},
    )
    stage_a_file()

    opts = run_opts(hook='filenames-via-stdin', verbose=True)
    ret, printed = _do_run(cap_out, store, repo_with_passing_hook, opts)

    assert ret == 0
    assert b'[]' in printed
    assert b"b'foo.py\\x00'" in printed
def test_fail_fast(cap_out, store, repo_with_failing_hook):
with modify_config() as config:
# More than one hook

View file

@ -1,6 +1,7 @@
from __future__ import annotations
import os.path
import shlex
import sys
from unittest import mock
@ -166,6 +167,42 @@ def test_basic_run_hook(tmp_path):
assert out == b'hi hello file file file\n'
def test_basic_run_hook_passes_filenames_via_stdin(tmp_path):
    """``basic_run_hook`` with ``pass_filenames_via_stdin=True``.

    The command prints the repr of its argv and of the bytes read from
    stdin; the expected output is an empty argv list and both filenames,
    each followed by a NUL byte.
    """
    prefix = Prefix(tmp_path)
    # echoes what the hook received: argv first, then stdin bytes
    entry = (
        f'{shlex.quote(sys.executable)} -c '
        '\'import sys; '
        'print(repr(sys.argv[1:])); '
        'print(repr(sys.stdin.buffer.read()))\''
    )
    file_args = ['file1', 'file2']

    ret, out = lang_base.basic_run_hook(
        prefix,
        entry,
        (),
        file_args,
        is_local=False,
        require_serial=False,
        color=False,
        pass_filenames_via_stdin=True,
    )

    assert ret == 0
    # collapse CRLF to LF before comparing
    normalized = out.replace(b'\r\n', b'\n')
    assert normalized == b"[]\nb'file1\\x00file2\\x00'\n"
def test_to_nul_delimited_filenames():
    """Two filenames encode to bytes with a NUL after each name."""
    encoded = lang_base.to_nul_delimited_filenames(('file1', 'file2'))
    assert encoded == b'file1\x00file2\x00'
def test_to_nul_delimited_filenames_empty():
    """An empty sequence of filenames encodes to empty bytes."""
    encoded = lang_base.to_nul_delimited_filenames(())
    assert encoded == b''
def test_from_nul_delimited_filenames():
    """NUL-terminated bytes decode to the list of filenames."""
    decoded = lang_base.from_nul_delimited_filenames(b'file1\x00file2\x00')
    assert decoded == ['file1', 'file2']
def test_hook_cmd():
assert lang_base.hook_cmd('echo hi', ()) == ('echo', 'hi')

View file

@ -1,5 +1,8 @@
from __future__ import annotations
import sys
from unittest import mock
import pytest
from pre_commit.languages import pygrep
@ -138,6 +141,16 @@ def test_grep_hook_matching(some_files, tmp_path):
assert ret == (1, b"f7:1:hello'hi\n")
@pytest.mark.usefixtures('some_files')
def test_main_reads_nul_delimited_filenames_from_stdin(cap_out):
    """``pygrep.main`` called with only a pattern, filenames coming from stdin.

    ``sys.stdin.buffer.read`` is patched to return ``f1`` and ``f2`` as
    NUL-terminated bytes; the expected result is exit code 1 with a single
    match reported in ``f1``.
    """
    stdin_read = mock.patch.object(
        sys.stdin.buffer, 'read', return_value=b'f1\x00f2\x00',
    )
    with stdin_read:
        ret = pygrep.main(('foo',))
    out = cap_out.get()

    assert ret == 1
    assert out == 'f1:1:foo\n'
@pytest.mark.parametrize('regex', ('nope', "foo'bar", r'^\[INFO\]'))
def test_grep_hook_not_matching(regex, some_files, tmp_path):
ret = run_language(tmp_path, pygrep, regex, file_args=('f7', 'f8', 'f9'))

View file

@ -430,6 +430,7 @@ def test_manifest_hooks(tempdir_factory, store):
minimum_pre_commit_version='0',
name='Bash hook',
pass_filenames=True,
pass_filenames_via_stdin=False,
require_serial=False,
stages=[
'commit-msg',