[ANNOUNCE] An extension to handle big files

Andrei Vermel avermel at mail.ru
Sun Oct 19 08:13:21 CDT 2008


Here's a more usable version.

# bigfiles.py: Support versions of big files with storage outside hg repo. 
#
# Copyright 2008 Andrei Vermel <andrei.vermel at gmail.com>
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.

'''support versions of big files with storage outside hg repo. 
To setup the extension add to a config file:
[bigfiles]
repo = path/to/versions/dir 

Big files are not stored in the hg repo. They are listed in a file called
'.bigfiles', which also serves as an ignore file similar to .hgignore, so
they do not clutter the output of hg commands. The file also stores the
checksums of the big files in the form of comments. File '.bigfiles' is
versioned by hg, so each changeset knows which big files it uses from the
names and checksums.
The file can be diffed and merged, which is nice.

The versions of big files are stored in a versions directory, with checksums
attached to filenames.
The extension overrides 'hg update', so that it can compare contents of 
'.bigfiles' before and after the update to remove and fetch appropriate big 
files.
The directory storing versions of big files can be synced with a remote one
(the extension doesn't do this, but it lists the necessary files).
The versions corresponding to old changesets can be removed to save space.

To add a new big file, use normal 'hg add', ignoring the size warning.
To remove a tracked big file, just delete it.
Then use:
'hg bstatus' - to examine state of big files in working directory.
'hg brefresh' - to refresh '.bigfiles' and versions directory with added, 
    removed and modified big files.
'hg bupdate' - to fetch files from versions directory as recorded in 
    '.bigfiles', and get a list of necessary files missing in the
    version directory.'''

from mercurial.i18n import _
from mercurial.node import *
from mercurial import commands, cmdutil, hg, node, util
import os, stat, cPickle, errno

# Hash constructor (from mercurial.util) used by _hash() to checksum big files.
_sha1 = util.sha1

def parse_bigfiles(repo):
    '''Read the working directory's '.bigfiles' into a {path: checksum} dict.

    Every line containing '#' is split at its last '#': the text before it
    is the file path, the text after it is the recorded checksum.  Lines
    without a '#' are skipped.

    Returns an empty dict when '.bigfiles' does not exist.  Any other
    IOError raised while opening the file is propagated.
    '''
    # Local import: sys is not imported at module level.
    import sys

    fname = repo.wjoin('.bigfiles')
    bigfiles = {}
    try:
        fp = open(fname)
    except IOError:
        # Fetch the exception through sys.exc_info() so that no
        # exception-binding syntax is needed in the except clause.
        err = sys.exc_info()[1]
        if err.errno != errno.ENOENT:
            raise
        return bigfiles
    try:
        for line in fp:
            if '#' not in line:
                continue
            # rsplit keeps any '#' that is part of the path itself.
            path, checksum = line.strip().rsplit('#', 1)
            bigfiles[path] = checksum
    finally:
        # Close the handle explicitly instead of relying on garbage
        # collection.
        fp.close()
    return bigfiles

def bigfiles_repo(ui):
    '''Return the configured versions directory ([bigfiles] repo).

    Raises util.Abort when the setting is empty or missing, when the path
    cannot be lstat()ed, or when it is not a directory.
    '''
    versions_dir = ui.config('bigfiles', 'repo')
    if not versions_dir:
        raise util.Abort(_('bigfiles.repo path not configured'))
    try:
        # lstat does not follow symlinks, so a symlink pointing at a
        # directory is reported as "not a directory" below.
        mode = os.lstat(versions_dir).st_mode
    except OSError:
        raise util.Abort(_("can't access bigfiles repo: %s") % versions_dir)
    if stat.S_ISDIR(mode):
        return versions_dir
    raise util.Abort(
        _('specified bigfiles repo %s is not a directory') % versions_dir)

def _hash(f):
    '''Return the hex digest (computed with _sha1) of the file at path f.

    The file is read in binary mode in chunks of 1000000 bytes so that
    big files are never loaded into memory in one piece.
    '''
    fp = open(f, 'rb')
    try:
        s = _sha1("")
        while True:
            chunk = fp.read(1000000)
            # An empty read means end of file.  A truthiness test is used
            # so the check does not depend on the string type read()
            # returns.
            if not chunk:
                break
            s.update(chunk)
    finally:
        # Close the handle explicitly instead of relying on garbage
        # collection.
        fp.close()
    return s.hexdigest()

def read_bigfiledirstate(ui, repo):
    '''Load the pickled big-file state cache from .hg/bigfiledirstate.

    Returns the unpickled object (the dict stored by
    write_bigfiledirstate), or an empty dict when the cache file does not
    exist.  Any other IOError raised while opening the file is propagated.
    The ui argument is unused.
    '''
    # Local import: sys is not imported at module level.
    import sys

    try:
        fp = open(repo.wjoin(".hg/bigfiledirstate"), "rb")
    except IOError:
        # Fetch the exception through sys.exc_info() so that no
        # exception-binding syntax is needed in the except clause.
        err = sys.exc_info()[1]
        if err.errno != errno.ENOENT:
            raise
        return {}
    try:
        # NOTE(review): unpickling runs whatever the file contains; this
        # presumes .hg/bigfiledirstate is only ever written by this
        # extension - confirm.
        return cPickle.load(fp)
    finally:
        # Close the handle even when unpickling fails.
        fp.close()

def write_bigfiledirstate(ui, repo, ds):
    '''Pickle the big-file state cache ds into .hg/bigfiledirstate.

    The ui argument is unused.
    '''
    fp = open(repo.wjoin(".hg/bigfiledirstate"), "wb")
    try:
        cPickle.dump(ds, fp)
    finally:
        # Close the handle even when pickling fails.
        fp.close()

def update_bigfiledirstate(repo, file, st, ds):
    '''Store a fresh (size, mtime, checksum) entry for file in the cache ds.

    Size and mtime are taken from the stat result st; the checksum is
    computed by hashing the file in the working directory.
    '''
    size, mtime = st.st_size, st.st_mtime
    digest = _hash(repo.wjoin(file))
    ds[file] = (size, mtime, digest)

def accelerated_hash(repo, file, st, ds):
    '''Return the checksum of file, reusing the cached value when possible.

    The entry cached in ds is trusted when both its recorded size and its
    recorded mtime equal those in the stat result st.  Otherwise the entry
    is recomputed (which hashes the file) and stored in ds first.
    '''
    # NOTE(review): a change that keeps both size and mtime is not
    # detected by this check - presumably accepted for speed; confirm.
    if (file not in ds
        or ds[file][0] != st.st_size
        or ds[file][1] != st.st_mtime):
        update_bigfiledirstate(repo, file, st, ds)
    return ds[file][2]

def _bigstatus(ui, repo, pats, opts, ds, bigfiles):
    '''Classify big files in the working directory.

    bigfiles is the {path: checksum} mapping read from '.bigfiles' and ds
    is the state cache used to avoid rehashing unchanged files (it may be
    updated here).

    Returns a 6-tuple of path lists, in this order:
      tracked_gotbig - modified per hg status, larger than MAX_SIZE
      added_big      - added per hg status, larger than MAX_SIZE
      modified       - listed in '.bigfiles', checksum differs from the
                       recorded one
      removed        - listed in '.bigfiles', missing in the working
                       directory, copy present in the versions directory
      gotsmall       - listed in '.bigfiles', size no longer above MAX_SIZE
      missinginrepo  - listed in '.bigfiles', missing both in the working
                       directory and in the versions directory
    '''
    MAX_SIZE = 10000000  # st_size threshold above which a file is "big"
    brepo = bigfiles_repo(ui)

    def oversized(names):
        # Keep the names whose working-directory file exceeds MAX_SIZE.
        return [name for name in names
                if os.lstat(repo.wjoin(name)).st_size > MAX_SIZE]

    node1, node2 = cmdutil.revpair(repo, None)
    matcher = cmdutil.match(repo, pats, opts)
    # The first two lists of repo.status() are the modified and added files.
    hg_modified, hg_added = repo.status(node1, node2, matcher,
                                        None, None, True)[0:2]
    tracked_gotbig = oversized(hg_modified)
    added_big = oversized(hg_added)

    modified = []
    removed = []
    gotsmall = []
    missinginrepo = []
    for name, recorded in bigfiles.iteritems():
        path = repo.wjoin(name)
        try:
            st = os.lstat(path)
        except OSError:
            # Not in the working directory: check whether the recorded
            # version exists in the versions directory.
            stored = "%s/%s.%s" % (brepo, name, recorded)
            try:
                os.lstat(stored)
            except OSError:
                missinginrepo.append(name)
            else:
                removed.append(name)
            continue
        if st.st_size <= MAX_SIZE:
            gotsmall.append(name)
        # A file that got small is still checked for modification.
        if accelerated_hash(repo, name, st, ds) != recorded:
            modified.append(name)

    return (tracked_gotbig, added_big, modified, removed, gotsmall,
            missinginrepo)

def bigstatus(ui, repo, *pats, **opts):
    '''show changed big files in the working directory
    Show status of big files in the repository.

    The codes used to show the status of files are:
    B = tracked by hg, got too big. 
    A = added to hg, too big
    M = modified
    S = got small, can now be tracked by hg
    ! = missing in working dir, present in versions repo
    R = missing in both working dir and versions repo'''
    # NOTE(review): the docstring above is presumably shown as the
    # command's help text, so it is left verbatim - confirm.

    ds = read_bigfiledirstate(ui, repo)
    bigfiles = parse_bigfiles(repo)
    groups = _bigstatus(ui, repo, pats, opts, ds, bigfiles)
    # One code per list returned by _bigstatus, in the same order.
    codes = ('B', 'A', 'M', '!', 'S', 'R')
    for code, names in zip(codes, groups):
        for name in names:
            if not opts['no_status']:
                line = "%s %s\n" % (code, name)
            else:
                line = "%s\n" % name
            ui.write(line)
    # _bigstatus may have hashed files; persist the refreshed cache.
    write_bigfiledirstate(ui, repo, ds)

def _updatebigrepo(ui, repo, files, brepo, bigfiles, ds):
    '''Record and store the current version of each path in files.

    For every path the checksum of the working-directory file is obtained
    (through the cache ds), recorded in the bigfiles mapping, and the file
    is copied to '<brepo>/<path>.<checksum>'.  Missing directories under
    brepo are created.  The ui argument is unused.
    '''
    for name in files:
        source = repo.wjoin(name)
        digest = accelerated_hash(repo, name, os.lstat(source), ds)
        bigfiles[name] = digest
        stored = "%s/%s.%s" % (brepo, name, digest)
        util.makedirs(os.path.dirname(stored))
        util.copyfile(source, stored)

def bigrefresh(ui, repo, *pats, **opts):
    '''update big files tracking as per working directory. 

    Added big files get forgotten and added to '.bigfiles' instead.
    Removed big files are deleted from '.bigfiles'. 
    Files tracked by hg that got too big are removed from hg, and added 
    to '.bigfiles'. 
    Copies of new and modified big files are stored in versions directory.'''

    ds = read_bigfiledirstate(ui, repo)
    bigfiles = parse_bigfiles(repo)
    tracked_gotbig, added_big, modified, removed, gotsmall, \
        missinginrepo = _bigstatus(ui, repo, pats, opts, ds, bigfiles)

    # With dry_run set, every message below is still printed but nothing
    # is changed apart from the state cache written at the end.
    for f in added_big:
        ui.write(_("forgetting %s\n") % f)
    if not opts['dry_run']:
        repo.forget(added_big)

    for f in tracked_gotbig:
        ui.write(_("removing %s\n") % f)
    if not opts['dry_run']:
        # unlink=False keeps the working-directory file; only hg stops
        # tracking it.
        repo.remove(tracked_gotbig, unlink=False)

    # Entries whose working copy is gone are dropped from '.bigfiles',
    # whether or not a copy exists in the versions directory.
    for f in removed + missinginrepo:
        ui.write(_("recording removal of %s\n") % f)

    brepo = bigfiles_repo(ui)

    if not opts['dry_run']:
        _updatebigrepo(ui, repo, tracked_gotbig + added_big + modified,
           brepo, bigfiles, ds)
        for name in removed + missinginrepo:
            del bigfiles[name]

        # Rewrite '.bigfiles': a 'syntax: glob' header followed by one
        # 'path#checksum' line per big file, sorted by path.
        fp = open(repo.wjoin('.bigfiles'), 'w')
        try:
            fp.write("syntax: glob\n\n")
            for f in util.sort(bigfiles.keys()):
                fp.write("%s#%s\n" % (f, bigfiles[f]))
        finally:
            # Close the handle even when a write fails.
            fp.close()
    write_bigfiledirstate(ui, repo, ds)

def bigupdate(ui, repo, *pats, **opts):
    '''fetch files from versions directory as recorded in '.bigfiles'. 
 
    Also complain about necessary files missing in the version directory'''
    # NOTE(review): the docstring above is presumably shown as the
    # command's help text, so it is left verbatim - confirm.
    ds = read_bigfiledirstate(ui, repo)
    bigfiles = parse_bigfiles(repo)
    status = _bigstatus(ui, repo, pats, opts, ds, bigfiles)
    # Only two of the six status lists are needed here.
    removed, missinginrepo = status[3], status[5]
    brepo = bigfiles_repo(ui)

    # Restore files that are missing in the working directory but present
    # in the versions directory.
    for name in removed:
        target = repo.wjoin(name)
        source = "%s/%s.%s" % (brepo, name, bigfiles[name])
        ui.write(_("fetching %s\n") % source)
        if opts['dry_run']:
            continue
        util.makedirs(os.path.dirname(target))
        util.copyfile(source, target)

    # List the versions that could not be found in the versions directory.
    if missinginrepo:
        ui.write(_("\nNeeded files missing in bigrepo %s:\n") % brepo)
        for name in missinginrepo:
            ui.write("%s.%s\n" % (name, bigfiles[name]))
    write_bigfiledirstate(ui, repo, ds)

def my_update(ui, repo, node=None, rev=None, clean=False, date=None):
    # Wrapper around commands.update that keeps big files in step with
    # '.bigfiles': it compares the contents of '.bigfiles' before and after
    # the update, unlinks big files that are no longer listed and fetches
    # new or changed ones from the versions directory.  Returns whatever
    # commands.update returned.
    before = parse_bigfiles(repo)
    res = commands.update(ui, repo, node, rev, clean, date)
    after = parse_bigfiles(repo)

    # Big files that disappeared from '.bigfiles' are removed from the
    # working directory; a file that is already gone is skipped silently.
    for name, old_hash in before.iteritems():
        if name in after:
            continue
        try:
            path = repo.wjoin(name)
            os.lstat(path)
            ui.write(_("unlinking %s.%s\n") % (name, old_hash))
            util.unlink(path)
        except OSError:
            pass

    # Entries that are new, or whose checksum changed, must be fetched.
    tofetch = dict([(name, new_hash)
                    for name, new_hash in after.iteritems()
                    if name not in before or before[name] != new_hash])
    if not tofetch:
        return res

    brepo = bigfiles_repo(ui)
    missing = {}
    for name, new_hash in tofetch.iteritems():
        target = repo.wjoin(name)
        source = "%s/%s.%s" % (brepo, name, new_hash)
        try:
            os.lstat(source)
            ui.write(_("fetching %s.%s\n") % (name, new_hash))
            util.makedirs(os.path.dirname(target))
            util.copyfile(source, target)
        except OSError:
            # Any OSError raised in the try body, not only a failed lstat
            # of the stored version, lands the file in the missing list.
            missing[name] = new_hash
    if missing:
        ui.write(_("\nNeeded files missing in bigrepo %s:\n") % brepo)
        for name, new_hash in missing.iteritems():
            ui.write("%s.%s\n" % (name, new_hash))
    return res

def _findrepo(p):
    while not os.path.isdir(os.path.join(p, ".hg")):
        oldp, p = p, os.path.dirname(p)
        if p == oldp:
            return None

    return p

def uisetup(ui):
    '''Install the update wrapper and register '.bigfiles' as an ignore file.

    Replaces the entry for the update command in commands.table with
    my_update (keeping the original options, synopsis and docstring).
    If the current directory is inside a repository whose root contains a
    '.bigfiles' file, that file is set as the ui.ignore.bigfiles config
    value, on ui.parentui when there is one and on ui otherwise.
    '''
    key = "^update|up|checkout|co"
    tmp = commands.table[key]
    my_update.__doc__ = tmp[0].__doc__
    commands.table[key] = (my_update, tmp[1], tmp[2])

    root = _findrepo(os.getcwd())
    if root is None:
        # Not inside a repository.  The original concatenated None with a
        # string here and raised TypeError.
        return
    path_bigfiles = root + '/.bigfiles'
    try:
        os.stat(path_bigfiles)
    except OSError:
        # No '.bigfiles' in this repository: nothing to register.
        return
    # getattr keeps this best-effort on a ui object without parentui.
    target = getattr(ui, 'parentui', None) or ui
    target.setconfig("ui", "ignore.bigfiles", path_bigfiles)

# Command table of the extension.  Each key is 'name|alias|...' and each
# value is a tuple (function, option list, synopsis).
# The two 'dry-run' help strings had been split across lines by mail
# wrapping, which is a syntax error in a single-quoted literal; they are
# rejoined here.
cmdtable = {
    'bigstatus|bstatus':
        (bigstatus,
         [('n', 'no-status', None, _('hide status prefix')),
         ] + commands.walkopts,
        _('hg bigstatus [SOURCE]')),
    'bigrefresh|brefresh':
        (bigrefresh,
         [('n', 'dry-run', None,
           _('do not perform actions, just print output')),
         ] + commands.walkopts,
        _('hg bigrefresh [SOURCE]')),
    'bigupdate|bup|bigcheckout|bco':
        (bigupdate,
         [('n', 'dry-run', None,
           _('do not perform actions, just print output')),
         ] + commands.walkopts,
        _('hg bigupdate [SOURCE]')),
}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: bigfiles.py
Type: application/octet-stream
Size: 11465 bytes
Desc: not available
Url : http://selenic.com/pipermail/mercurial/attachments/20081019/91c31e1b/attachment.obj 


More information about the Mercurial mailing list