pre-commit/pre_commit/lang_base.py
Sharmila Jesupaul 635912514d add pass_filenames_via_stdin for large changesets
pre-commit currently passes selected filenames to hooks via argv.
For large changesets (or --all-files), argv length limits are hit and
filenames are partitioned, causing multiple hook invocations.

This means there is currently no built-in way to pass filenames to an
underlying hook in one shot without chunking / re-running. The only practical
workaround is to set pass_filenames: false and run custom git operations in
hook code to reconstruct the file set, which is expensive and duplicates
pre-commit's own file-selection logic.

This change adds a hook option:

    pass_filenames_via_stdin: true

When enabled, pre-commit sends filenames as NUL-delimited bytes on stdin and
runs the hook in a single invocation (no argv chunking).

Why NUL-delimited stdin:
- safe for filenames containing spaces/newlines
- matches established -0 conventions in unix tooling

Usage for hook authors:
- shell:

    while IFS= read -r -d '' filename; do
        ...
    done

- python:

    data = sys.stdin.buffer.read()
    filenames = [os.fsdecode(p) for p in data.split(b'\0') if p]

Behavior notes:
- default remains argv-based passing
- pass_filenames: false still disables filename passing entirely

Implementation includes schema/runtime wiring, shared NUL encode/decode
helpers, and tests covering defaulting and runtime behavior.
2026-02-18 18:06:34 -08:00

220 lines
6 KiB
Python

from __future__ import annotations
import contextlib
import os
import random
import re
import shlex
import sys
from collections.abc import Generator
from collections.abc import Sequence
from typing import Any
from typing import ContextManager
from typing import NoReturn
from typing import Protocol
import pre_commit.constants as C
from pre_commit import parse_shebang
from pre_commit import xargs
from pre_commit.prefix import Prefix
from pre_commit.util import cmd_output_b
FIXED_RANDOM_SEED = 1542676187
SHIMS_RE = re.compile(r'[/\\]shims[/\\]')
NUL = b'\0'
class Language(Protocol):
# Use `None` for no installation / environment
@property
def ENVIRONMENT_DIR(self) -> str | None: ...
# return a value to replace `'default` for `language_version`
def get_default_version(self) -> str: ...
# return whether the environment is healthy (or should be rebuilt)
def health_check(self, prefix: Prefix, version: str) -> str | None: ...
# install a repository for the given language and language_version
def install_environment(
self,
prefix: Prefix,
version: str,
additional_dependencies: Sequence[str],
) -> None:
...
# modify the environment for hook execution
def in_env(self, prefix: Prefix, version: str) -> ContextManager[None]: ...
# execute a hook and return the exit code and output
def run_hook(
self,
prefix: Prefix,
entry: str,
args: Sequence[str],
file_args: Sequence[str],
*,
is_local: bool,
require_serial: bool,
color: bool,
pass_filenames_via_stdin: bool = False,
) -> tuple[int, bytes]:
...
def exe_exists(exe: str) -> bool:
found = parse_shebang.find_executable(exe)
if found is None: # exe exists
return False
homedir = os.path.expanduser('~')
try:
common: str | None = os.path.commonpath((found, homedir))
except ValueError: # on windows, different drives raises ValueError
common = None
return (
# it is not in a /shims/ directory
not SHIMS_RE.search(found) and
(
# the homedir is / (docker, service user, etc.)
os.path.dirname(homedir) == homedir or
# the exe is not contained in the home directory
common != homedir
)
)
def setup_cmd(prefix: Prefix, cmd: tuple[str, ...], **kwargs: Any) -> None:
cmd_output_b(*cmd, cwd=prefix.prefix_dir, **kwargs)
def environment_dir(prefix: Prefix, d: str, language_version: str) -> str:
return prefix.path(f'{d}-{language_version}')
def assert_version_default(binary: str, version: str) -> None:
if version != C.DEFAULT:
raise AssertionError(
f'for now, pre-commit requires system-installed {binary} -- '
f'you selected `language_version: {version}`',
)
def assert_no_additional_deps(
lang: str,
additional_deps: Sequence[str],
) -> None:
if additional_deps:
raise AssertionError(
f'for now, pre-commit does not support '
f'additional_dependencies for {lang} -- '
f'you selected `additional_dependencies: {additional_deps}`',
)
def basic_get_default_version() -> str:
return C.DEFAULT
def basic_health_check(prefix: Prefix, language_version: str) -> str | None:
return None
def no_install(
prefix: Prefix,
version: str,
additional_dependencies: Sequence[str],
) -> NoReturn:
raise AssertionError('This language is not installable')
@contextlib.contextmanager
def no_env(prefix: Prefix, version: str) -> Generator[None]:
yield
def target_concurrency() -> int:
if 'PRE_COMMIT_NO_CONCURRENCY' in os.environ:
return 1
else:
# Travis appears to have a bunch of CPUs, but we can't use them all.
if 'TRAVIS' in os.environ:
return 2
else:
return xargs.cpu_count()
def _shuffled(seq: Sequence[str]) -> list[str]:
"""Deterministically shuffle"""
fixed_random = random.Random()
fixed_random.seed(FIXED_RANDOM_SEED, version=1)
seq = list(seq)
fixed_random.shuffle(seq)
return seq
def to_nul_delimited_filenames(file_args: Sequence[str]) -> bytes:
ret = NUL.join(os.fsencode(filename) for filename in file_args)
return ret + NUL if ret else ret
def from_nul_delimited_filenames(filenames: bytes) -> list[str]:
return [os.fsdecode(part) for part in filenames.split(NUL) if part]
def run_xargs(
cmd: tuple[str, ...],
file_args: Sequence[str],
*,
require_serial: bool,
color: bool,
pass_filenames_via_stdin: bool = False,
) -> tuple[int, bytes]:
if pass_filenames_via_stdin:
stdin = to_nul_delimited_filenames(file_args)
return xargs.xargs(
cmd,
(),
target_concurrency=1,
color=color,
input=stdin,
)
if require_serial:
jobs = 1
else:
# Shuffle the files so that they more evenly fill out the xargs
# partitions, but do it deterministically in case a hook cares about
# ordering.
file_args = _shuffled(file_args)
jobs = target_concurrency()
return xargs.xargs(cmd, file_args, target_concurrency=jobs, color=color)
def hook_cmd(entry: str, args: Sequence[str]) -> tuple[str, ...]:
cmd = shlex.split(entry)
if cmd[:2] == ['pre-commit', 'hazmat']:
cmd = [sys.executable, '-m', 'pre_commit.commands.hazmat', *cmd[2:]]
return (*cmd, *args)
def basic_run_hook(
prefix: Prefix,
entry: str,
args: Sequence[str],
file_args: Sequence[str],
*,
is_local: bool,
require_serial: bool,
color: bool,
pass_filenames_via_stdin: bool = False,
) -> tuple[int, bytes]:
return run_xargs(
hook_cmd(entry, args),
file_args,
require_serial=require_serial,
color=color,
pass_filenames_via_stdin=pass_filenames_via_stdin,
)