Skip to content

filter: add API for registering Python filters #1237

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 1, 2023
63 changes: 63 additions & 0 deletions docs/filters.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
**********************************************************************
Filters
**********************************************************************

pygit2 supports defining and registering libgit2 blob filters implemented
in Python.

The Filter type
===============

.. autoclass:: pygit2.Filter
:members:

.. autoclass:: pygit2.FilterSource

Registering filters
===================

.. autofunction:: pygit2.filter_register
.. autofunction:: pygit2.filter_unregister

Example
=======

The following example is a simple Python implementation of a filter which
enforces that blobs are stored with unix LF line-endings in the ODB, and
checked out with line-endings in accordance with the .gitattributes ``eol``
setting.

.. code-block:: python

class CRLFFilter(pygit2.Filter):
    """Example filter which normalizes line-endings.

    Blobs are written to the ODB with LF line-endings, and are checked out
    with the line-ending selected by the .gitattributes ``eol`` setting
    (falling back to the platform default).
    """

    attributes = "text eol=*"

    def __init__(self):
        super().__init__()
        # platform default, may be overridden in check()
        self.linesep = b'\r\n' if os.name == 'nt' else b'\n'
        self.buffer = io.BytesIO()

    def check(self, src, attr_values):
        """Select the output line-ending for the blob described by `src`."""
        if src.mode == GIT_FILTER_SMUDGE:
            # attr_values contains the values of the 'text' and 'eol'
            # attributes in that order (as they are defined in
            # CRLFFilter.attributes)
            eol = attr_values[1]

            if eol == 'crlf':
                self.linesep = b'\r\n'
            elif eol == 'lf':
                self.linesep = b'\n'
        else:  # src.mode == GIT_FILTER_CLEAN
            # always use LF line-endings when writing to the ODB
            self.linesep = b'\n'

    def write(self, data, src, write_next):
        """Buffer a chunk of input; nothing is emitted until close()."""
        # buffer input data in case line-ending sequences span chunk boundaries
        self.buffer.write(data)

    def close(self, write_next):
        """Convert the buffered input and pass the result to `write_next`."""
        # apply line-ending conversion to our buffered input and write all
        # of our output data
        data = self.buffer.getvalue()
        output = self.linesep.join(data.splitlines())
        # splitlines() discards the terminator of the final line; put it
        # back so that a trailing newline in the input is not lost
        if data.endswith((b'\r', b'\n')):
            output += self.linesep
        write_next(output)
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ Table of Contents
config
diff
features
filters
index_file
mailmap
merge
Expand Down
14 changes: 14 additions & 0 deletions docs/objects.rst
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,20 @@ creating the blob object:
.. autofunction:: pygit2.hash
.. autofunction:: pygit2.hashfile

Streaming blob content
----------------------

`pygit2.Blob.data` and `pygit2.Blob.read_raw()` read the full contents of the
blob into memory and return Python ``bytes``. Both return the raw contents of
the blob; they do not apply any of the filters which would be applied upon
checkout to the working directory.

Raw and filtered blob data can be accessed as a Python Binary I/O stream
(i.e. a file-like object):

.. autoclass:: pygit2.BlobIO
:members:


Trees
=================
Expand Down
4 changes: 4 additions & 0 deletions pygit2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,14 @@

# High level API
from .blame import Blame, BlameHunk
from .blob import BlobIO
from .callbacks import git_clone_options, git_fetch_options, get_credentials
from .callbacks import Payload, RemoteCallbacks, CheckoutCallbacks, StashApplyCallbacks
from .config import Config
from .credentials import *
from .errors import check_error, Passthrough
from .ffi import ffi, C
from .filter import Filter
from .index import Index, IndexEntry
from .remote import Remote
from .repository import Repository
Expand Down Expand Up @@ -92,6 +94,8 @@
GIT_ATTR_CHECK_INDEX_THEN_FILE = C.GIT_ATTR_CHECK_INDEX_THEN_FILE
GIT_ATTR_CHECK_INDEX_ONLY = C.GIT_ATTR_CHECK_INDEX_ONLY
GIT_ATTR_CHECK_NO_SYSTEM = C.GIT_ATTR_CHECK_NO_SYSTEM
GIT_ATTR_CHECK_INCLUDE_HEAD = C.GIT_ATTR_CHECK_INCLUDE_HEAD
GIT_ATTR_CHECK_INCLUDE_COMMIT = C.GIT_ATTR_CHECK_INCLUDE_COMMIT

# GIT_FETCH_PRUNE
GIT_FETCH_PRUNE_UNSPECIFIED = C.GIT_FETCH_PRUNE_UNSPECIFIED
Expand Down
2 changes: 1 addition & 1 deletion pygit2/_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@
# Order matters
h_files = [
'types.h',
'attr.h',
'oid.h',
'attr.h',
'blame.h',
'buffer.h',
'strarray.h',
Expand Down
154 changes: 154 additions & 0 deletions pygit2/blob.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import io
import threading
import time
from contextlib import AbstractContextManager
from typing import Optional
from queue import Queue

from ._pygit2 import GIT_BLOB_FILTER_CHECK_FOR_BINARY, Blob, Oid


class _BlobIO(io.RawIOBase):
    """Low-level wrapper for streaming blob content.

    The underlying libgit2 git_writestream filter chain will be run
    in a separate thread. The GIL will be released while running
    libgit2 filtering.

    The worker thread runs ``blob._write_to_queue`` and is handed a
    single-slot queue plus two events; the reader side (`readinto`) pulls
    chunks from that queue.
    """

    def __init__(
        self,
        blob: Blob,
        as_path: Optional[str] = None,
        flags: int = GIT_BLOB_FILTER_CHECK_FOR_BINARY,
        commit_id: Optional[Oid] = None,
    ):
        """Start streaming `blob` in a background daemon thread.

        Parameters:
            blob: The blob to read.
            as_path: Forwarded to ``blob._write_to_queue``.
            flags: Forwarded to ``blob._write_to_queue``.
            commit_id: Forwarded to ``blob._write_to_queue``.
        """
        super().__init__()
        self._blob = blob
        # maxsize=1: at most one chunk is held between writer and reader
        self._queue = Queue(maxsize=1)
        self._ready = threading.Event()
        self._writer_closed = threading.Event()
        # chunk (or remainder of a chunk) not yet handed to the caller
        self._chunk: Optional[bytes] = None
        self._thread = threading.Thread(
            target=self._blob._write_to_queue,
            args=(self._queue, self._ready, self._writer_closed),
            kwargs={
                "as_path": as_path,
                "flags": flags,
                "commit_id": commit_id,
            },
            daemon=True,
        )
        self._thread.start()

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def isatty(self):
        """Return False: a blob stream is never a terminal."""
        return False

    def readable(self):
        """Return True: the stream supports reading."""
        return True

    def writable(self):
        """Return False: the stream is read-only."""
        return False

    def seekable(self):
        """Return False: the stream does not support random access."""
        return False

    def readinto(self, b, /):
        """Copy up to ``len(b)`` bytes into `b`.

        Returns the number of bytes copied, or 0 once the writer has
        finished and the queue is empty (EOF). Also returns 0 if a
        KeyboardInterrupt is raised while waiting.
        """
        try:
            while self._chunk is None:
                self._ready.wait()
                if self._queue.empty():
                    if self._writer_closed.is_set():
                        # EOF
                        return 0
                    # nothing queued yet: reset the event, yield to other
                    # threads and wait again
                    self._ready.clear()
                    time.sleep(0)
                    continue
                chunk = self._queue.get()
                if chunk:
                    self._chunk = chunk

            if len(self._chunk) <= len(b):
                # the whole pending chunk fits in the caller's buffer
                bytes_written = len(self._chunk)
                b[:bytes_written] = self._chunk
                self._chunk = None
                return bytes_written
            # fill the caller's buffer and keep the remainder for next time
            bytes_written = len(b)
            b[:] = self._chunk[:bytes_written]
            self._chunk = self._chunk[bytes_written:]
            return bytes_written
        except KeyboardInterrupt:
            return 0

    def close(self):
        """Wait for the writer thread, drain the queue and close the stream.

        A KeyboardInterrupt raised while waiting is swallowed so that
        cleanup is best-effort.
        """
        try:
            self._ready.wait()
            self._writer_closed.wait()
            while self._queue is not None and not self._queue.empty():
                self._queue.get()
            self._thread.join()
        except KeyboardInterrupt:
            pass
        self._queue = None
        # mark the stream as closed so that `closed` reports True and
        # further I/O raises ValueError, as the io module specifies
        super().close()


class BlobIO(io.BufferedReader, AbstractContextManager):
    """Read-only wrapper for streaming blob content.

    Supports reading both raw and filtered blob content.
    Implements io.BufferedReader.

    Example:

        >>> with BlobIO(blob) as f:
        ...     while True:
        ...         # Read blob data in 1KB chunks until EOF is reached
        ...         chunk = f.read(1024)
        ...         if not chunk:
        ...             break

    By default, `BlobIO` will stream the raw contents of the blob, but it
    can also be used to stream filtered content (i.e. to read the content
    after applying filters which would be used when checking out the blob
    to the working directory).

    Example:

        >>> with BlobIO(blob, as_path='my_file.ext') as f:
        ...     # Read the filtered content which would be returned upon
        ...     # running 'git checkout -- my_file.ext'
        ...     filtered_data = f.read()
    """

    def __init__(
        self,
        blob: Blob,
        as_path: Optional[str] = None,
        flags: int = GIT_BLOB_FILTER_CHECK_FOR_BINARY,
        commit_id: Optional[Oid] = None,
    ):
        """Wrap the specified blob.

        Parameters:
            blob: The blob to wrap.
            as_path: Filter the contents of the blob as if it had the specified
                path. If `as_path` is None, the raw contents of the blob will
                be read.
            flags: GIT_BLOB_FILTER_* bitflags (only applicable when `as_path`
                is set).
            commit_id: Commit to load attributes from when
                GIT_BLOB_FILTER_ATTRIBUTES_FROM_COMMIT is specified in `flags`
                (only applicable when `as_path` is set).
        """
        raw = _BlobIO(blob, as_path=as_path, flags=flags, commit_id=commit_id)
        super().__init__(raw)

    def __exit__(self, exc_type, exc_value, traceback):
        # leaving the `with` block closes the stream
        self.close()


# Register both classes as virtual subclasses of the corresponding io ABCs.
# NOTE(review): _BlobIO already derives from io.RawIOBase and BlobIO from
# io.BufferedReader, so these registrations look redundant -- confirm before
# removing.
io.RawIOBase.register(_BlobIO)
io.BufferedIOBase.register(BlobIO)
17 changes: 14 additions & 3 deletions pygit2/decl/attr.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
#define GIT_ATTR_CHECK_FILE_THEN_INDEX 0
#define GIT_ATTR_CHECK_INDEX_THEN_FILE 1
#define GIT_ATTR_CHECK_INDEX_ONLY 2
#define GIT_ATTR_CHECK_NO_SYSTEM 4
#define GIT_ATTR_CHECK_NO_SYSTEM 4
#define GIT_ATTR_CHECK_INCLUDE_HEAD 8
#define GIT_ATTR_CHECK_INCLUDE_COMMIT 16

#define GIT_ATTR_OPTIONS_VERSION ...

typedef enum {
GIT_ATTR_VALUE_UNSPECIFIED = 0, /**< The attribute has been left unspecified */
Expand All @@ -10,10 +14,17 @@ typedef enum {
GIT_ATTR_VALUE_STRING /**< This attribute has a value */
} git_attr_value_t;

int git_attr_get(
/* Options struct passed as `opts` to git_attr_get_ext(). */
typedef struct {
unsigned int version; /* presumably GIT_ATTR_OPTIONS_VERSION -- confirm against libgit2 */
unsigned int flags; /* presumably a combination of GIT_ATTR_CHECK_* values -- confirm */
git_oid *commit_id; /* NOTE(review): overlaps in purpose with attr_commit_id; check libgit2 for which one is current */
git_oid attr_commit_id; /* presumably the commit used with GIT_ATTR_CHECK_INCLUDE_COMMIT -- confirm */
} git_attr_options;

int git_attr_get_ext(
const char **value_out,
git_repository *repo,
uint32_t flags,
git_attr_options *opts,
const char *path,
const char *name);

Expand Down
Loading