mirror of
https://github.com/pre-commit/pre-commit.git
synced 2026-04-15 10:01:46 +04:00
Add file classification package
This commit is contained in:
parent
2b22a36b1c
commit
cebba0404d
7 changed files with 324 additions and 0 deletions
0
pre_commit/file_classifier/__init__.py
Normal file
0
pre_commit/file_classifier/__init__.py
Normal file
142
pre_commit/file_classifier/classifier.py
Normal file
142
pre_commit/file_classifier/classifier.py
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
# encoding: utf-8
|
||||
import io
|
||||
import re
|
||||
import string
|
||||
from itertools import chain
|
||||
from os.path import basename
|
||||
|
||||
from pre_commit.file_classifier.extensions import KNOWN_EXTENSIONS
|
||||
from pre_commit.file_classifier.interpreters import KNOWN_INTERPRETERS
|
||||
from pre_commit.git import GIT_MODE_EXECUTABLE
|
||||
from pre_commit.git import GIT_MODE_FILE
|
||||
from pre_commit.git import GIT_MODE_SUBMODULE
|
||||
from pre_commit.git import GIT_MODE_SYMLINK
|
||||
|
||||
|
||||
def classify(path, mode):
|
||||
"""Return a set of tags for a file.
|
||||
|
||||
:param path: path to the file
|
||||
:param mode: Git mode of the file
|
||||
:return: set of tags
|
||||
"""
|
||||
tags = set()
|
||||
|
||||
if mode in (GIT_MODE_FILE, GIT_MODE_EXECUTABLE):
|
||||
tags.add('file')
|
||||
|
||||
types = _guess_types_from_extension(path)
|
||||
if types:
|
||||
tags.update(types)
|
||||
|
||||
if _file_is_binary(path):
|
||||
tags.add('binary')
|
||||
else:
|
||||
tags.add('text')
|
||||
if not types:
|
||||
# only check the shebang if we couldn't guess by extension;
|
||||
# it's much slower
|
||||
tags.update(_guess_types_from_shebang(path))
|
||||
|
||||
if mode == GIT_MODE_EXECUTABLE:
|
||||
tags.add('executable')
|
||||
else:
|
||||
tags.add('nonexecutable')
|
||||
|
||||
elif mode == GIT_MODE_SYMLINK:
|
||||
tags.add('symlink')
|
||||
elif mode == GIT_MODE_SUBMODULE:
|
||||
tags.add('submodule')
|
||||
else:
|
||||
raise ValueError('Unknown git object mode: {}'.format(mode))
|
||||
|
||||
return tags
|
||||
|
||||
|
||||
def _guess_types_from_extension(path):
|
||||
"""Guess types for a file based on extension.
|
||||
|
||||
An extension could map to multiple file types, in which case we return the
|
||||
concatenation of types.
|
||||
"""
|
||||
filename = basename(path)
|
||||
return list(chain.from_iterable(
|
||||
types for regex, types in KNOWN_EXTENSIONS
|
||||
if re.search(regex, filename)
|
||||
))
|
||||
|
||||
|
||||
def _guess_types_from_shebang(path):
|
||||
"""Guess types for a text file based on shebang.
|
||||
|
||||
A shebang could map to multiple file types, in which case we return the
|
||||
concatenation of types.
|
||||
"""
|
||||
interpreter = _read_interpreter_from_shebang(path)
|
||||
if interpreter:
|
||||
return chain.from_iterable(
|
||||
types for regex, types in KNOWN_INTERPRETERS
|
||||
if re.match(regex, interpreter)
|
||||
)
|
||||
else:
|
||||
return []
|
||||
|
||||
|
||||
def _read_interpreter_from_shebang(path):
|
||||
"""Read an interpreter from a file's shebang.
|
||||
|
||||
The first line of a script is guaranteed to be ASCII, so we read ASCII
|
||||
until we hit a newline (at which point we check if we read a valid shebang)
|
||||
or a non-ASCII character (at which point we bail).
|
||||
|
||||
:param path: path to text file
|
||||
:return: interpreter, or None if no shebang could be read
|
||||
"""
|
||||
MAX_SHEBANG_LENGTH = 128 # Linux kernel limit on shebangs
|
||||
|
||||
with io.open(path, 'rb') as f:
|
||||
bytes_read = f.read(MAX_SHEBANG_LENGTH)
|
||||
|
||||
chars_read = ''
|
||||
for i in range(MAX_SHEBANG_LENGTH):
|
||||
try:
|
||||
char = bytes_read[i:i + 1].decode('ascii')
|
||||
if char not in string.printable:
|
||||
return None
|
||||
except UnicodeDecodeError:
|
||||
return None # no valid shebang
|
||||
|
||||
if char != '\n':
|
||||
chars_read += char
|
||||
else:
|
||||
break
|
||||
|
||||
if chars_read.startswith('#!'):
|
||||
words = chars_read[2:].strip().split()
|
||||
if not words or not words[0]:
|
||||
return None
|
||||
|
||||
# take the first word of the shebang as the interpreter, unless that
|
||||
# word is something like /usr/bin/env
|
||||
if words[0].endswith('/env') and len(words) == 2:
|
||||
interpreter = words[1]
|
||||
else:
|
||||
interpreter = words[0]
|
||||
|
||||
return interpreter.split('/')[-1]
|
||||
|
||||
|
||||
def _file_is_binary(path):
|
||||
"""Return whether the file seems to be binary.
|
||||
|
||||
This is roughly based on libmagic's binary/text detection:
|
||||
https://github.com/file/file/blob/master/src/encoding.c#L203-L228
|
||||
"""
|
||||
text_chars = (
|
||||
bytearray([7, 8, 9, 10, 12, 13, 27]) +
|
||||
bytearray(range(0x20, 0x7F)) +
|
||||
bytearray(range(0x80, 0X100))
|
||||
)
|
||||
with io.open(path, 'rb') as f:
|
||||
b = f.read(1024) # only read first KB
|
||||
return bool(b.translate(None, text_chars))
|
||||
28
pre_commit/file_classifier/extensions.py
Normal file
28
pre_commit/file_classifier/extensions.py
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
"""List of known filename to file type mappings.
|
||||
|
||||
The list consists of tuples of (filename regex, list of types).
|
||||
|
||||
Most of these are extensions (e.g. *.py -> python), but some just use the
|
||||
filename (e.g. Makefile -> make).
|
||||
"""
|
||||
KNOWN_EXTENSIONS = [
|
||||
(r'\.js$', ['javascript']),
|
||||
(r'\.json$', ['json']),
|
||||
(r'\.py$', ['python']),
|
||||
(r'\.rb$', ['ruby']),
|
||||
(r'\.sh$', ['shell']),
|
||||
(r'\.e?ya?ml$', ['yaml']),
|
||||
(r'\.pp$', ['puppet']),
|
||||
(r'\.erb$', ['erb']),
|
||||
(r'\.json$', ['json']),
|
||||
(r'\.xml$', ['xml']),
|
||||
(r'\.c$', ['c']),
|
||||
(r'^Makefile$', ['make']),
|
||||
(r'\.mk$', ['make']),
|
||||
(r'\.png$', ['png']),
|
||||
(r'\.gif$', ['gif']),
|
||||
(r'\.svg$', ['svg']),
|
||||
(r'\.css$', ['css']),
|
||||
(r'\.html?$', ['html']),
|
||||
(r'\.php\d?$', ['php']),
|
||||
]
|
||||
6
pre_commit/file_classifier/interpreters.py
Normal file
6
pre_commit/file_classifier/interpreters.py
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
KNOWN_INTERPRETERS = [
|
||||
('^python([23](\.[0-9]+)?)?$', ['python']),
|
||||
('^(ba|da|tc|[ckz])?sh$', ['shell']),
|
||||
('^ruby$', ['ruby']),
|
||||
('^node(js)?$', ['javascript']),
|
||||
]
|
||||
Loading…
Add table
Add a link
Reference in a new issue