Home > Uncategorized > Gzip files in a git history

Gzip files in a git history

I’m migrating git repositories with large files to LFS. LFS does not support local compression, and my app supports gzip-compressed files, so I guess it’s better to compress the files. Thus I wrote a git filter-branch --index-filter script that gzips files in all commits. I use a pickle file to avoid recompressing files that were already compressed. It compresses a 4GiB repository (.git size) of 200 commits in 2.5 hours, which is not that good. I guess running faster would require hacking BFG Repo-Cleaner or using pigz instead of the Python zlib module.

from subprocess import *
import os
import tempfile
import gzip
import pickle
import shutil

def should_compress(name):
    """Tell whether the file at path ``name`` must be gzip-compressed.

    Adapt this predicate to your needs; as written it selects only the
    paths that end with ``.app``.
    """
    suffix = '.app'
    return name.endswith(suffix)

def git(*args):
    """Run ``git`` with the given command-line arguments.

    Raises ``CalledProcessError`` when git exits with a non-zero status.
    An explicit exception is used instead of ``assert`` because asserts
    are stripped under ``python -O``, which would let a failed git
    command go unnoticed.
    """
    check_call(['git'] + list(args))

# Cache mapping the SHA-1 of each uncompressed blob to the SHA-1 of its
# gzip-compressed counterpart.  It is stored inside GIT_DIR so that blobs
# compressed while rewriting one commit are reused for the following ones.
db_file = os.path.join(os.environ["GIT_DIR"], "gzip_rewrite.pickle")
try:
    # Pickle data is binary (it is written with protocol -1 below), so the
    # file must be opened in binary mode.
    with open(db_file, "rb") as f:
        gzipdb = pickle.load(f)
except IOError:
    # The cache could not be read (typically it does not exist yet):
    # start with an empty one.
    gzipdb = {}

commit = os.environ['GIT_COMMIT']


def _to_native(raw):
    """Return subprocess output ``raw`` as the interpreter's native ``str``.

    On Python 2 the output already is a ``str`` and is returned unchanged;
    on Python 3 the bytes are decoded with ``os.fsdecode`` so that the
    value can be handed back to git as a command-line argument.
    """
    if isinstance(raw, str):
        return raw
    return os.fsdecode(raw)


def _tree_entries(treeish):
    """Yield ``(mode, type, sha1, path)`` for every entry under ``treeish``.

    ``-z`` asks git for NUL-terminated records with verbatim paths, and the
    path is separated from the metadata by a tab, so paths that contain
    spaces or other unusual characters are parsed correctly.
    """
    listing = check_output(['git', 'ls-tree', '-r', '-z', treeish])
    for record in listing.split(b'\0'):
        if not record:
            # Empty piece after the final NUL terminator.
            continue
        meta, _, path = record.partition(b'\t')
        mode, kind, sha1 = meta.split()
        yield (_to_native(mode), _to_native(kind),
               _to_native(sha1), _to_native(path))


def _gzip_blob(blob_spec):
    """Write a gzip-compressed copy of ``blob_spec`` to the object database.

    ``blob_spec`` is a ``<commit>:<path>`` expression understood by
    ``git show``.  Returns the SHA-1 of the new compressed blob.

    Raises ``CalledProcessError`` if either git process fails, so that a
    truncated or empty blob is never recorded in the cache.
    """
    p_show = Popen(['git', 'show', blob_spec], stdout=PIPE)
    p_hash = Popen(['git', 'hash-object', '-w', '--stdin'],
                   stdin=PIPE, stdout=PIPE)
    # mtime=0 keeps the gzip header free of the current time, so identical
    # content always produces the identical compressed blob.
    gz = gzip.GzipFile(fileobj=p_hash.stdin, mode='wb', mtime=0)
    try:
        shutil.copyfileobj(p_show.stdout, gz, 2**20)
    finally:
        # Closing writes the gzip trailer (CRC and size); do it explicitly
        # instead of relying on garbage collection.  It does not close the
        # underlying pipe, which communicate() takes care of below.
        gz.close()
        p_show.stdout.close()
    out, _ = p_hash.communicate()
    if p_show.wait() != 0:
        raise CalledProcessError(p_show.returncode, 'git show')
    if p_hash.returncode != 0:
        raise CalledProcessError(p_hash.returncode, 'git hash-object')
    # hash-object prints the SHA-1 followed by a newline.
    return _to_native(out).strip()


# Iterate over all files in the current commit
for f_mod, f_type, f_sha1, f_name in _tree_entries(commit):
    # Only regular blob entries can be compressed (submodules are listed
    # with type "commit").
    if f_type != 'blob' or not should_compress(f_name):
        continue
    if f_sha1 not in gzipdb:
        print('Compressing ' + f_name)
        gzipdb[f_sha1] = _gzip_blob(commit + ':' + f_name)
    # Replace <name> with <name>.gz in the index, keeping the file mode.
    git('update-index', '--add', '--cacheinfo', f_mod, gzipdb[f_sha1], f_name + '.gz')
    git('update-index', '--remove', f_name)

# Persist the cache for the next commit; binary mode matches the binary
# pickle protocol selected by -1.
with open(db_file, "wb") as f:
    pickle.dump(gzipdb, f, -1)
Advertisements
  1. No comments yet.
  1. No trackbacks yet.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

%d bloggers like this: