pre-commit/tests/languages/pygrep_test.py
Sharmila Jesupaul 635912514d add pass_filenames_via_stdin for large changesets
pre-commit currently passes selected filenames to hooks via argv.
For large changesets (or --all-files), argv length limits are hit and
filenames are partitioned, causing multiple hook invocations.

This means there is currently no built-in way to pass filenames to an
underlying hook in one shot without chunking / re-running. The only practical
workaround is to set pass_filenames: false and run custom git operations in
hook code to reconstruct the file set, which is expensive and duplicates
pre-commit's own file-selection logic.

This change adds a hook option:

    pass_filenames_via_stdin: true

When enabled, pre-commit sends filenames as NUL-delimited bytes on stdin and
runs the hook in a single invocation (no argv chunking).

Why NUL-delimited stdin:
- safe for filenames containing spaces/newlines
- matches established -0 conventions in unix tooling

Usage for hook authors:
- shell:

    while IFS= read -r -d '' filename; do
        ...
    done

- python:

    data = sys.stdin.buffer.read()
    filenames = [os.fsdecode(p) for p in data.split(b'\0') if p]

Behavior notes:
- default remains argv-based passing
- pass_filenames: false still disables filename passing entirely

Implementation includes schema/runtime wiring, shared NUL encode/decode
helpers, and tests covering defaulting and runtime behavior.
2026-02-18 18:06:34 -08:00

157 lines
4.5 KiB
Python

from __future__ import annotations
import sys
from unittest import mock
import pytest
from pre_commit.languages import pygrep
from testing.language_helpers import run_language
@pytest.fixture
def some_files(tmpdir):
    """Create sample files ``f1``..``f9`` in ``tmpdir`` and run the test there.

    Each file is written as raw bytes; the test body executes while
    ``tmpdir`` is the current working directory.
    """
    sample_contents = (
        ('f1', b'foo\nbar\n'),
        ('f2', b'[INFO] hi\n'),
        ('f3', b"with'quotes\n"),
        ('f4', b'foo\npattern\nbar\n'),
        ('f5', b'[INFO] hi\npattern\nbar'),
        ('f6', b"pattern\nbarwith'foo\n"),
        ('f7', b"hello'hi\nworld\n"),
        ('f8', b'foo\nbar\nbaz\n'),
        ('f9', b'[WARN] hi\n'),
    )
    for filename, payload in sample_contents:
        tmpdir.join(filename).write_binary(payload)

    with tmpdir.as_cwd():
        yield
@pytest.mark.usefixtures('some_files')
@pytest.mark.parametrize(
    ('pattern', 'expected_retcode', 'expected_out'),
    (
        ('baz', 0, ''),
        ('foo', 1, 'f1:1:foo\n'),
        ('bar', 1, 'f1:2:bar\n'),
        (r'(?i)\[info\]', 1, 'f2:1:[INFO] hi\n'),
        ("h'q", 1, "f3:1:with'quotes\n"),
    ),
)
def test_main(cap_out, pattern, expected_retcode, expected_out):
    """Run ``pygrep.main`` for ``pattern`` over f1, f2 and f3.

    Checks both the return code and the captured output against the
    parametrized expectations.
    """
    argv = (pattern, 'f1', 'f2', 'f3')
    retcode = pygrep.main(argv)
    captured = cap_out.get()
    assert (retcode, captured) == (expected_retcode, expected_out)
@pytest.mark.usefixtures('some_files')
def test_negate_by_line_no_match(cap_out):
    """``--negate`` without ``--multiline``: all of f4, f5, f6 are reported.

    The pattern contains a newline; the test expects exit code 1 and every
    file name listed in the output.
    """
    argv = ('pattern\nbar', 'f4', 'f5', 'f6', '--negate')
    retcode = pygrep.main(argv)
    captured = cap_out.get()
    assert (retcode, captured) == (1, 'f4\nf5\nf6\n')
@pytest.mark.usefixtures('some_files')
def test_negate_by_line_two_match(cap_out):
    """``--negate`` with ``foo``: only f5 (which lacks ``foo``) is reported."""
    argv = ('foo', 'f4', 'f5', 'f6', '--negate')
    retcode = pygrep.main(argv)
    captured = cap_out.get()
    assert (retcode, captured) == (1, 'f5\n')
@pytest.mark.usefixtures('some_files')
def test_negate_by_line_all_match(cap_out):
    """``--negate`` with ``pattern``: f4, f5, f6 all contain it, so success."""
    argv = ('pattern', 'f4', 'f5', 'f6', '--negate')
    retcode = pygrep.main(argv)
    captured = cap_out.get()
    assert (retcode, captured) == (0, '')
@pytest.mark.usefixtures('some_files')
def test_negate_by_file_no_match(cap_out):
    """``--negate --multiline`` with ``baz``: no file has it, all reported."""
    argv = ('baz', 'f4', 'f5', 'f6', '--negate', '--multiline')
    retcode = pygrep.main(argv)
    captured = cap_out.get()
    assert (retcode, captured) == (1, 'f4\nf5\nf6\n')
@pytest.mark.usefixtures('some_files')
def test_negate_by_file_one_match(cap_out):
    """``--negate --multiline``: only f4 contains the text, f5/f6 reported."""
    argv = ('foo\npattern', 'f4', 'f5', 'f6', '--negate', '--multiline')
    retcode = pygrep.main(argv)
    captured = cap_out.get()
    assert (retcode, captured) == (1, 'f5\nf6\n')
@pytest.mark.usefixtures('some_files')
def test_negate_by_file_all_match(cap_out):
    """``--negate --multiline``: every file contains the text, so success."""
    argv = ('pattern\nbar', 'f4', 'f5', 'f6', '--negate', '--multiline')
    retcode = pygrep.main(argv)
    captured = cap_out.get()
    assert (retcode, captured) == (0, '')
@pytest.mark.usefixtures('some_files')
def test_ignore_case(cap_out):
    """``--ignore-case`` lets lowercase ``info`` hit the ``[INFO]`` line in f2."""
    argv = ('--ignore-case', 'info', 'f1', 'f2', 'f3')
    retcode = pygrep.main(argv)
    captured = cap_out.get()
    assert (retcode, captured) == (1, 'f2:1:[INFO] hi\n')
@pytest.mark.usefixtures('some_files')
def test_multiline(cap_out):
    """``--multiline`` reports a match that spans two lines of f1."""
    argv = ('--multiline', r'foo\nbar', 'f1', 'f2', 'f3')
    retcode = pygrep.main(argv)
    captured = cap_out.get()
    assert (retcode, captured) == (1, 'f1:1:foo\nbar\n')
@pytest.mark.usefixtures('some_files')
def test_multiline_line_number(cap_out):
    """``--multiline`` reports line 2 for a match located on f1's 2nd line."""
    argv = ('--multiline', r'ar', 'f1', 'f2', 'f3')
    retcode = pygrep.main(argv)
    captured = cap_out.get()
    assert (retcode, captured) == (1, 'f1:2:bar\n')
@pytest.mark.usefixtures('some_files')
def test_multiline_dotall_flag_is_enabled(cap_out):
    """Under ``--multiline``, ``.*`` is expected to match across a newline."""
    argv = ('--multiline', r'o.*bar', 'f1', 'f2', 'f3')
    retcode = pygrep.main(argv)
    captured = cap_out.get()
    assert (retcode, captured) == (1, 'f1:1:foo\nbar\n')
@pytest.mark.usefixtures('some_files')
def test_multiline_multiline_flag_is_enabled(cap_out):
    """Under ``--multiline``, ``$`` is expected to match at a line end."""
    argv = ('--multiline', r'foo$.*bar', 'f1', 'f2', 'f3')
    retcode = pygrep.main(argv)
    captured = cap_out.get()
    assert (retcode, captured) == (1, 'f1:1:foo\nbar\n')
def test_grep_hook_matching(some_files, tmp_path):
    """Run pygrep through ``run_language``; ``ello`` hits f7's first line."""
    targets = ('f7', 'f8', 'f9')
    result = run_language(tmp_path, pygrep, 'ello', file_args=targets)
    expected = (1, b"f7:1:hello'hi\n")
    assert result == expected
@pytest.mark.usefixtures('some_files')
def test_main_reads_nul_delimited_filenames_from_stdin(cap_out):
    """With only a pattern in argv, filenames are expected to come from stdin.

    ``sys.stdin.buffer.read`` is patched to return ``f1`` and ``f2`` as
    NUL-terminated names; the match in f1 must then be reported.
    """
    stdin_payload = b'f1\x00f2\x00'
    patched_read = mock.patch.object(
        sys.stdin.buffer, 'read', return_value=stdin_payload,
    )
    with patched_read:
        retcode = pygrep.main(('foo',))
    captured = cap_out.get()
    assert (retcode, captured) == (1, 'f1:1:foo\n')
@pytest.mark.parametrize('regex', ('nope', "foo'bar", r'^\[INFO\]'))
def test_grep_hook_not_matching(regex, some_files, tmp_path):
    """Run pygrep through ``run_language`` with regexes absent from f7-f9.

    Expects a zero exit code and empty output.
    """
    targets = ('f7', 'f8', 'f9')
    result = run_language(tmp_path, pygrep, regex, file_args=targets)
    expected = (0, b'')
    assert result == expected