From cebba0404d848490d1076c3d37129e896d37afbd Mon Sep 17 00:00:00 2001
From: Chris Kuehl
Date: Mon, 10 Aug 2015 17:32:18 -0700
Subject: [PATCH] Add file classification package

---
Note: the blob ids on the "index" lines below are those of the originally
submitted revision and were not recomputed for this revision.

 pre_commit/file_classifier/__init__.py     |   0
 pre_commit/file_classifier/classifier.py   | 158 +++++++++++++++++++++
 pre_commit/file_classifier/extensions.py   |  27 +++
 pre_commit/file_classifier/interpreters.py |  13 +
 pre_commit/git.py                          |   5 +
 tests/file_classifier/__init__.py          |   0
 tests/file_classifier/classifier_test.py   | 148 +++++++++++++++++++
 7 files changed, 351 insertions(+)
 create mode 100644 pre_commit/file_classifier/__init__.py
 create mode 100644 pre_commit/file_classifier/classifier.py
 create mode 100644 pre_commit/file_classifier/extensions.py
 create mode 100644 pre_commit/file_classifier/interpreters.py
 create mode 100644 tests/file_classifier/__init__.py
 create mode 100644 tests/file_classifier/classifier_test.py

diff --git a/pre_commit/file_classifier/__init__.py b/pre_commit/file_classifier/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pre_commit/file_classifier/classifier.py b/pre_commit/file_classifier/classifier.py
new file mode 100644
index 00000000..42481d23
--- /dev/null
+++ b/pre_commit/file_classifier/classifier.py
@@ -0,0 +1,158 @@
+# encoding: utf-8
+import io
+import re
+import string
+from itertools import chain
+from os.path import basename
+
+from pre_commit.file_classifier.extensions import KNOWN_EXTENSIONS
+from pre_commit.file_classifier.interpreters import KNOWN_INTERPRETERS
+from pre_commit.git import GIT_MODE_EXECUTABLE
+from pre_commit.git import GIT_MODE_FILE
+from pre_commit.git import GIT_MODE_SUBMODULE
+from pre_commit.git import GIT_MODE_SYMLINK
+
+
+def classify(path, mode):
+    """Return a set of tags for a file.
+
+    Regular files (executable or not) are tagged 'file', with any types
+    guessed from the filename, 'binary' or 'text' depending on their content,
+    and 'executable' or 'nonexecutable' depending on their mode.  Text files
+    whose filename gave no hint are additionally classified by shebang.
+    Symlinks and submodules only get the 'symlink' / 'submodule' tag; their
+    contents are never read.
+
+    :param path: path to the file
+    :param mode: Git mode of the file
+    :return: set of tags
+    :raises ValueError: if mode is not one of the known GIT_MODE_* values
+    """
+    tags = set()
+
+    if mode in (GIT_MODE_FILE, GIT_MODE_EXECUTABLE):
+        tags.add('file')
+
+        types = _guess_types_from_extension(path)
+        if types:
+            tags.update(types)
+
+        if _file_is_binary(path):
+            tags.add('binary')
+        else:
+            tags.add('text')
+            if not types:
+                # only check the shebang if we couldn't guess by extension;
+                # it's much slower
+                tags.update(_guess_types_from_shebang(path))
+
+        if mode == GIT_MODE_EXECUTABLE:
+            tags.add('executable')
+        else:
+            tags.add('nonexecutable')
+
+    elif mode == GIT_MODE_SYMLINK:
+        tags.add('symlink')
+    elif mode == GIT_MODE_SUBMODULE:
+        tags.add('submodule')
+    else:
+        raise ValueError('Unknown git object mode: {}'.format(mode))
+
+    return tags
+
+
+def _guess_types_from_extension(path):
+    """Guess types for a file based on extension.
+
+    An extension could map to multiple file types, in which case we return the
+    concatenation of types.
+
+    :param path: path to the file; only its basename is examined
+    :return: list of types (empty if no known pattern matches)
+    """
+    filename = basename(path)
+    return list(chain.from_iterable(
+        types for regex, types in KNOWN_EXTENSIONS
+        if re.search(regex, filename)
+    ))
+
+
+def _guess_types_from_shebang(path):
+    """Guess types for a text file based on shebang.
+
+    A shebang could map to multiple file types, in which case we return the
+    concatenation of types.
+
+    :param path: path to text file
+    :return: list of types (empty if there is no recognized shebang)
+    """
+    interpreter = _read_interpreter_from_shebang(path)
+    if not interpreter:
+        return []
+
+    # always return a list, matching _guess_types_from_extension
+    return list(chain.from_iterable(
+        types for regex, types in KNOWN_INTERPRETERS
+        if re.match(regex, interpreter)
+    ))
+
+
+def _read_interpreter_from_shebang(path):
+    """Read an interpreter from a file's shebang.
+
+    Only the first line of the file is examined, and at most
+    MAX_SHEBANG_LENGTH bytes of it.  That line must consist entirely of
+    printable ASCII characters (string.printable); otherwise we bail.
+
+    :param path: path to text file
+    :return: basename of the interpreter (e.g. 'python3'), or None if no
+        shebang could be read
+    """
+    MAX_SHEBANG_LENGTH = 128  # Linux kernel limit on shebangs
+
+    with io.open(path, 'rb') as f:
+        first_line = f.read(MAX_SHEBANG_LENGTH).split(b'\n', 1)[0]
+
+    try:
+        chars_read = first_line.decode('ascii')
+    except UnicodeDecodeError:
+        return None  # no valid shebang
+    if any(char not in string.printable for char in chars_read):
+        return None
+
+    if not chars_read.startswith('#!'):
+        return None
+
+    # split() with no arguments also discards surrounding whitespace
+    words = chars_read[2:].split()
+    if not words:
+        return None
+
+    # take the first word of the shebang as the interpreter, unless that
+    # word is something like /usr/bin/env
+    if words[0].endswith('/env') and len(words) == 2:
+        interpreter = words[1]
+    else:
+        interpreter = words[0]
+
+    return interpreter.split('/')[-1]
+
+
+def _file_is_binary(path):
+    """Return whether the file seems to be binary.
+
+    This is roughly based on libmagic's binary/text detection:
+    https://github.com/file/file/blob/master/src/encoding.c#L203-L228
+
+    :param path: path to the file; only its first KB is examined
+    :return: True if any byte read falls outside the "text" byte set
+    """
+    text_chars = (
+        bytearray([7, 8, 9, 10, 12, 13, 27]) +
+        bytearray(range(0x20, 0x7F)) +
+        bytearray(range(0x80, 0x100))
+    )
+    with io.open(path, 'rb') as f:
+        b = f.read(1024)  # only read first KB
+    # deleting every text byte leaves only the bytes that look binary
+    return bool(b.translate(None, text_chars))
diff --git a/pre_commit/file_classifier/extensions.py b/pre_commit/file_classifier/extensions.py
new file mode 100644
index 00000000..098a7162
--- /dev/null
+++ b/pre_commit/file_classifier/extensions.py
@@ -0,0 +1,27 @@
+"""List of known filename to file type mappings.
+
+The list consists of tuples of (filename regex, list of types).
+
+Most of these are extensions (e.g. *.py -> python), but some just use the
+filename (e.g. Makefile -> make).
+"""
+KNOWN_EXTENSIONS = [
+    (r'\.js$', ['javascript']),
+    (r'\.json$', ['json']),
+    (r'\.py$', ['python']),
+    (r'\.rb$', ['ruby']),
+    (r'\.sh$', ['shell']),
+    (r'\.e?ya?ml$', ['yaml']),
+    (r'\.pp$', ['puppet']),
+    (r'\.erb$', ['erb']),
+    (r'\.xml$', ['xml']),
+    (r'\.c$', ['c']),
+    (r'^Makefile$', ['make']),
+    (r'\.mk$', ['make']),
+    (r'\.png$', ['png']),
+    (r'\.gif$', ['gif']),
+    (r'\.svg$', ['svg']),
+    (r'\.css$', ['css']),
+    (r'\.html?$', ['html']),
+    (r'\.php\d?$', ['php']),
+]
diff --git a/pre_commit/file_classifier/interpreters.py b/pre_commit/file_classifier/interpreters.py
new file mode 100644
index 00000000..1434ac25
--- /dev/null
+++ b/pre_commit/file_classifier/interpreters.py
@@ -0,0 +1,13 @@
+"""List of known interpreter to file type mappings.
+
+The list consists of tuples of (interpreter regex, list of types).
+
+Each regex is matched against the basename of the interpreter read from a
+file's shebang (e.g. python3 -> python).
+"""
+KNOWN_INTERPRETERS = [
+    (r'^python([23](\.[0-9]+)?)?$', ['python']),
+    (r'^(ba|da|tc|[ckz])?sh$', ['shell']),
+    (r'^ruby$', ['ruby']),
+    (r'^node(js)?$', ['javascript']),
+]
diff --git a/pre_commit/git.py b/pre_commit/git.py
index 796a0b8a..2d37d344 100644
--- a/pre_commit/git.py
+++ b/pre_commit/git.py
@@ -11,6 +11,11 @@ from pre_commit.util import CalledProcessError
 from pre_commit.util import cmd_output
 from pre_commit.util import memoize_by_cwd
 
+GIT_MODE_FILE = 0o100644
+GIT_MODE_EXECUTABLE = 0o100755
+GIT_MODE_SYMLINK = 0o120000
+GIT_MODE_SUBMODULE = 0o160000
+
 
 logger = logging.getLogger('pre_commit')
 
diff --git a/tests/file_classifier/__init__.py b/tests/file_classifier/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/file_classifier/classifier_test.py b/tests/file_classifier/classifier_test.py
new file mode 100644
index 00000000..a115b482
--- /dev/null
+++ b/tests/file_classifier/classifier_test.py
@@ -0,0 +1,148 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from contextlib import contextmanager
+
+import mock
+import pytest
+
+from pre_commit.file_classifier.classifier import _file_is_binary
+from pre_commit.file_classifier.classifier import _guess_types_from_extension
+from pre_commit.file_classifier.classifier import _guess_types_from_shebang
+from pre_commit.file_classifier.classifier import _read_interpreter_from_shebang  # noqa
+from pre_commit.file_classifier.classifier import classify
+from pre_commit.git import GIT_MODE_EXECUTABLE
+from pre_commit.git import GIT_MODE_FILE
+from pre_commit.git import GIT_MODE_SUBMODULE
+from pre_commit.git import GIT_MODE_SYMLINK
+
+
+@contextmanager
+def mock_open(read_data):
+    # mock_open doesn't support reading binary data :\
+    # https://bugs.python.org/issue23004
+    with mock.patch('io.open') as m:
+        mock_read = m.return_value.__enter__().read
+        mock_read.return_value = read_data
+        yield m
+
+
+@pytest.mark.parametrize('path,data,mode,expected', [
+    (
+        'test.py',
+        b'def main():\n pass\n',
+        GIT_MODE_FILE,
+        ['file', 'text', 'python', 'nonexecutable'],
+    ),
+    (
+        'Makefile',
+        b'test:\n\ttac /etc/passwd\n',
+        GIT_MODE_FILE,
+        ['file', 'text', 'make', 'nonexecutable'],
+    ),
+    (
+        'delete-everything',
+        b'#!/bin/bash\nrm -rf /\n',
+        GIT_MODE_EXECUTABLE,
+        ['file', 'text', 'shell', 'executable'],
+    ),
+    (
+        'bin/bash',
+        b'\x7f\x45\x4c\x46\x02\x01\x01',
+        GIT_MODE_EXECUTABLE,
+        ['file', 'binary', 'executable'],
+    ),
+    (
+        'modules/apache2',
+        None,
+        GIT_MODE_SUBMODULE,
+        ['submodule'],
+    ),
+    (
+        'some/secret',
+        None,
+        GIT_MODE_SYMLINK,
+        ['symlink'],
+    ),
+])
+def test_classify(path, data, mode, expected):
+    with mock_open(data):
+        assert set(classify(path, mode)) == set(expected)
+
+
+def test_classify_invalid():
+    # should raise ValueError if given a mode that it doesn't know about
+    with pytest.raises(ValueError):
+        classify('some_path', 9999)
+
+
+@pytest.mark.parametrize('path,expected', [
+    ('/hello/foo.py', ['python']),
+    ('a/b/c/d/e.rb', ['ruby']),
+    ('derp.sh', ['shell']),
+    ('derp.tmpl.sh', ['shell']),
+
+    ('', []),
+    ('derpsh', []),
+    ('\x7f\x45\x4c\x46\x02\x01\x01\x00\x00', []),
+])
+def test_guess_types_from_extension(path, expected):
+    assert set(_guess_types_from_extension(path)) == set(expected)
+
+
+def test_guess_types_from_extension_no_duplicates():
+    # each pattern is listed once, so each type is reported once
+    assert _guess_types_from_extension('package.json') == ['json']
+
+
+@pytest.mark.parametrize('data,expected', [
+    (b'#!/usr/bin/env python3\nasdf', ['python']),
+    (b'#!/usr/bin/env /usr/bin/python2.7\nasdf', ['python']),
+    (b'#!/bin/bash -euxm', ['shell']),
+    (b'#!/bin/sh -euxm', ['shell']),
+
+    (b'', []),
+    (b'\x7f\x45\x4c\x46\x02\x01\x01\x00\x00', []),
+])
+def test_guess_types_from_shebang(data, expected):
+    with mock_open(data):
+        assert set(_guess_types_from_shebang('/etc/passwd')) == set(expected)
+
+
+@pytest.mark.parametrize('data,expected', [
+    (b'#!/usr/bin/env python3\nasdf', 'python3'),
+    (b'#!/bin/bash -euxm', 'bash'),
+    (b'#!/bin/bash -e -u -x -m', 'bash'),
+    (b'#! /usr/bin/python ', 'python'),
+
+    (b'what is this', None),
+    (b'', None),
+    (b'#!\n/usr/bin/python', None),
+    (b'\n#!/usr/bin/python', None),
+    ('#!/usr/bin/énv python3\nasdf'.encode('utf8'), None),
+    (b'#! ', None),
+    (b'\x7f\x45\x4c\x46\x02\x01\x01\x00\x00', None),
+    (b'#!\x7f\x45\x4c\x46\x02\x01\x01\x00\x00', None),
+])
+def test_read_interpreter_from_shebang(data, expected):
+    with mock_open(data) as m:
+        assert _read_interpreter_from_shebang('/etc/passwd') == expected
+        m.assert_called_once_with('/etc/passwd', 'rb')
+
+
+@pytest.mark.parametrize('data,expected', [
+    (b'hello world', False),
+    (b'', False),
+    ('éóñəå ⊂(◉‿◉)つ(ノ≥∇≤)ノ'.encode('utf8'), False),
+    ('¯\\_(ツ)_/¯'.encode('utf8'), False),
+    ('♪┏(・o・)┛♪┗ ( ・o・) ┓♪┏ ( ) ┛♪┗ (・o・ ) ┓♪┏(・o・)┛♪'.encode('utf8'), False),
+    ('éóñå'.encode('latin1'), False),
+
+    (b'hello world\x00', True),
+    (b'\x7f\x45\x4c\x46\x02\x01\x01', True),  # first few bytes of /bin/bash
+    (b'\x43\x92\xd9\x0f\xaf\x32\x2c', True),  # some /dev/urandom output
+])
+def test_file_is_binary(data, expected):
+    with mock_open(data) as m:
+        assert _file_is_binary('/etc/passwd') is expected
+        m.assert_called_once_with('/etc/passwd', 'rb')