Trouble Committing Binary Files

Andrei Vermel avermel at mail.ru
Mon Nov 3 23:22:03 CST 2008


> On Mon, 2008-11-03 at 14:48 -0800, Clark Hwang wrote:
> > I can't seem to commit a binary file that's about 300 MB in size.  My
> > workstation has 2 GIB of RAM.  Mercurial tells me "abort: out of
> > memory".
> > I was thinking the 300 MB was not over the 2 x file size limit on my
> > workstation.   Please advise.
> 
> Which version of Mercurial are you using and how much -free- memory do
> you have? If you're running Linux, send the output of the 'free'
> command.
> 
> The factor may actually be more like 5 or 6.
I was able to check in a 170 MB file on a 2 GB Windows box, but checkout
aborted with an out-of-memory error, so checkout is probably the bottleneck
now.

As a workaround, my co-workers and I are now using the 'bigfiles' extension.
http://www.selenic.com/pipermail/mercurial/2008-October/021899.html

The latest version is below and attached.

# bigfiles.py: Support versions of big files with storage outside hg repo. 
#
# Copyright 2008 Andrei Vermel <andrei.vermel at gmail.com>
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.

'''support versions of big files with storage outside hg repo. 
To setup the extension add to a config file:
[bigfiles]
repo = path/to/versions/dir 

Big files are not stored in the hg repo. They are listed in a file called
'.bigfiles', which also serves as an ignore file similar to .hgignore, so
they do not clutter the output of hg commands. The file also stores the
checksums of the big files in the form of comments. The '.bigfiles' file is
versioned by hg, so each changeset knows which big files it uses from the
names and checksums.
The file can be diffed and merged, which is nice.

The versions of big files are stored in a versions directory, with checksums
attached to filenames.
The extension overrides 'hg update', so that it can compare contents of 
'.bigfiles' before and after the update to remove and fetch appropriate big 
files.
The directory storing versions of big files can be synced with the remote
one (the extension doesn't do this, but lists the necessary files).
The versions corresponding to old changesets can be removed to save space.

To add a new big file, use normal 'hg add', ignoring the size warning.
To remove a tracked big file, just delete it.
Then use:
'hg bstatus' - to examine state of big files in working directory.
'hg brefresh' - to refresh '.bigfiles' and versions directory with added, 
    removed and modified big files.
'hg bupdate' - to fetch files from versions directory as recorded in 
    '.bigfiles', and get a list of necessary files missing in the
    version directory.'''

from mercurial.i18n import _
from mercurial.node import *
from mercurial import commands, cmdutil, hg, node, util
import os, stat, cPickle, errno

# SHA-1 constructor taken from mercurial.util; _hash() uses it to checksum
# big files.
_sha1 = util.sha1

def parse_bigfiles(repo):
    fname = repo.wjoin('.bigfiles')
    bigfiles = {}
    try:
        for str in open(fname):
            if '#' not in str:
                continue
            path, hash = str.strip().rsplit('#', 1)
            bigfiles[path] = hash
    except IOError, err:
        if err.errno != errno.ENOENT: raise
    return bigfiles 

def bigfiles_repo(ui):
    '''return the versions directory configured as bigfiles.repo.

    Aborts when the setting is absent or empty, when the path cannot be
    lstat'ed, or when it is not a directory.'''
    brepo = ui.config('bigfiles', 'repo')
    if not brepo:
        raise util.Abort(_('bigfiles.repo path not configured'))
    try:
        mode = os.lstat(brepo).st_mode
    except OSError:
        raise util.Abort(_("can't access bigfiles repo: %s") % brepo)
    if stat.S_ISDIR(mode):
        return brepo
    raise util.Abort(
        _('specified bigfiles repo %s is not a directory') % brepo)

def _hash(f):
    '''return the hex SHA-1 digest of the file at path f.

    The file is opened in binary mode and read in chunks of 1000000 bytes,
    so a big file is never held in memory as a whole.'''
    fp = open(f, 'rb')
    try:
        s = _sha1("")
        while True:
            text = fp.read(1000000)
            if not text:
                break
            s.update(text)
    finally:
        # release the handle promptly; an open handle can block later
        # unlink/copy of the same file on Windows
        fp.close()
    return s.hexdigest()

def read_bigfiledirstate(ui, repo):
    ds = {}
    try:
        fp = open(repo.wjoin(".hg/bigfiledirstate"), "rb")
        ds = cPickle.load(fp)
        fp.close()
    except IOError, err:
        if err.errno != errno.ENOENT: raise
    return ds

def write_bigfiledirstate(ui, repo, ds):
    '''pickle ds into '.hg/bigfiledirstate', replacing any previous content.

    The ui argument is unused.'''
    fp = open(repo.wjoin(".hg/bigfiledirstate"), "wb")
    try:
        cPickle.dump(ds, fp)
    finally:
        # close the handle even if pickling fails
        fp.close()

def update_bigfiledirstate(repo, file, st, ds):
    '''store a (size, mtime, checksum) entry for file in ds.

    Size and mtime are taken from the stat result st; the checksum is
    computed from the working copy of the file.'''
    size = st.st_size
    mtime = st.st_mtime
    ds[file] = (size, mtime, _hash(repo.wjoin(file)))

def accelerated_hash(repo, file, st, ds):
    '''return the checksum of file, reusing the entry cached in ds.

    The cached checksum is used when ds has an entry for file whose size
    and mtime both equal those of the stat result st. Otherwise the entry
    is refreshed (which rehashes the file) before it is returned.'''
    stale = True
    if file in ds:
        cached = ds[file]
        stale = cached[0] != st.st_size or cached[1] != st.st_mtime
    if stale:
        update_bigfiledirstate(repo, file, st, ds)
    return ds[file][2]

def _bigstatus(ui, repo, pats, opts, ds, bigfiles):
    '''classify big files in the working directory.

    Returns six lists of file names, in this order:
    tracked_gotbig - modified per repo.status and larger than MAX_SIZE
    added_big      - added per repo.status and larger than MAX_SIZE
    modified       - listed in bigfiles, checksum differs from the recorded
    removed        - listed in bigfiles, absent from the working directory
                     but present in the versions directory
    gotsmall       - listed in bigfiles, size is at most MAX_SIZE
    missinginrepo  - listed in bigfiles, absent from both the working
                     directory and the versions directory

    ds is the checksum cache; it may be updated as files are hashed.'''
    MAX_SIZE = 10000000
    brepo = bigfiles_repo(ui)

    def toobig(names):
        # keep the names whose working copy exceeds MAX_SIZE
        return [name for name in names
                if os.lstat(repo.wjoin(name)).st_size > MAX_SIZE]

    node1, node2 = cmdutil.revpair(repo, None)
    matcher = cmdutil.match(repo, pats, opts)
    status = repo.status(node1, node2, matcher, None, None, True)
    mod_all, added_all = status[0:2]

    tracked_gotbig = toobig(mod_all)    # not in .bigfiles
    added_big = toobig(added_all)       # not in .bigfiles

    modified = []       # already in .bigfiles
    removed = []        # missing, but still in .bigfiles
    gotsmall = []       # still in .bigfiles
    missinginrepo = []  # file recorded in .bigfiles not in bigfiles repo

    for name, recorded in bigfiles.iteritems():
        try:
            st = os.lstat(repo.wjoin(name))
        except OSError:
            # no working copy: check whether the recorded version is stored
            stored = "%s/%s.%s" % (brepo, name, recorded)
            try:
                os.lstat(stored)
            except OSError:
                missinginrepo.append(name)
            else:
                removed.append(name)
            continue
        if st.st_size <= MAX_SIZE:
            gotsmall.append(name)
        if accelerated_hash(repo, name, st, ds) != recorded:
            modified.append(name)

    return (tracked_gotbig, added_big, modified, removed, gotsmall,
            missinginrepo)

def bigstatus(ui, repo, *pats, **opts):
    '''show changed big files in the working directory

    Show status of big files in the repository.

    The codes used to show the status of files are:
    B = tracked by hg, got too big. 
    A = added to hg, too big
    M = modified
    S = got small, can now be tracked by hg
    ! = missing in working dir, present in versions repo
    R = missing in both working dir and versions repo'''

    ds = read_bigfiledirstate(ui, repo)
    bigfiles = parse_bigfiles(repo)
    bst = _bigstatus(ui, repo, pats, opts, ds, bigfiles)
    # one code per list, in the order in which _bigstatus returns them
    codes = ('B', 'A', 'M', '!', 'S', 'R')
    for code, names in zip(codes, bst):
        for name in names:
            if not opts['no_status']:
                ui.write("%s %s\n" % (code, name))
            else:
                ui.write("%s\n" % name)
    # hashing may have refreshed cache entries, so persist them
    write_bigfiledirstate(ui, repo, ds)

def _updatebigrepo(ui, repo, files, brepo, bigfiles, ds):
    '''copy the given working files into the versions directory brepo.

    For each file the current checksum is recorded in bigfiles and the
    file is copied to '<brepo>/<file>.<checksum>', creating directories
    as needed. The ui argument is unused.'''
    for name in files:
        source = repo.wjoin(name)
        checksum = accelerated_hash(repo, name, os.lstat(source), ds)
        bigfiles[name] = checksum
        stored = "%s/%s.%s" % (brepo, name, checksum)
        util.makedirs(os.path.dirname(stored))
        util.copyfile(source, stored)

def bigrefresh(ui, repo, *pats, **opts):
    '''update big files tracking as per working directory. 

    Added big files get forgotten and added to '.bigfiles' instead.
    Removed big files are deleted from '.bigfiles'. 
    Files tracked by hg that got too big are removed from hg, and added 
    to '.bigfiles'. 
    Copies of new and modified big files are stored in versions
    directory.'''

    ds = read_bigfiledirstate(ui, repo)
    bigfiles = parse_bigfiles(repo)
    tracked_gotbig, added_big, modified, removed, gotsmall, \
        missinginrepo = _bigstatus(ui, repo, pats, opts, ds, bigfiles)
    # entries whose working copy is gone are dropped from '.bigfiles',
    # whether or not a stored version exists
    gone = removed + missinginrepo

    for f in added_big:
        ui.write(_("forgetting %s\n") % f)
    if not opts['dry_run']:
        repo.forget(added_big)

    for f in tracked_gotbig:
        ui.write(_("removing %s\n") % f)
    if not opts['dry_run']:
        # unlink=False keeps the working copy; only hg stops tracking it
        repo.remove(tracked_gotbig, unlink=False)

    for f in gone:
        ui.write(_("recording removal of %s\n") % f)

    brepo = bigfiles_repo(ui)

    if not opts['dry_run']:
        _updatebigrepo(ui, repo, tracked_gotbig + added_big + modified,
           brepo, bigfiles, ds)
        for file in gone:
            del bigfiles[file]

        fp = open(repo.wjoin('.bigfiles'), 'w')
        try:
            fp.write("syntax: glob\n\n")
            for f in util.sort(bigfiles.keys()):
                fp.write("%s#%s\n" % (f, bigfiles[f]))
        finally:
            # close the handle even if a write fails part-way
            fp.close()
    # the checksum cache is saved in dry-run mode too
    write_bigfiledirstate(ui, repo, ds)

def bigupdate(ui, repo, *pats, **opts):
    '''fetch files from versions directory as recorded in '.bigfiles'. 
 
    Also complain about necessary files missing in the version directory'''
    ds = read_bigfiledirstate(ui, repo)
    bigfiles = parse_bigfiles(repo)
    status = _bigstatus(ui, repo, pats, opts, ds, bigfiles)
    # only two of the six status lists are needed here
    removed, missinginrepo = status[3], status[5]
    brepo = bigfiles_repo(ui)

    for name in removed:
        target = repo.wjoin(name)
        stored = "%s/%s.%s" % (brepo, name, bigfiles[name])
        ui.write(_("fetching %s\n") % stored)
        if opts['dry_run']:
            continue
        util.makedirs(os.path.dirname(target))
        util.copyfile(stored, target)

    if missinginrepo:
        ui.write(_("\nNeeded files missing in bigrepo %s:\n") % brepo)
        for name in missinginrepo:
            ui.write("%s.%s\n" % (name, bigfiles[name]))
    # hashing may have refreshed cache entries, so persist them
    write_bigfiledirstate(ui, repo, ds)

def my_update(ui, repo, node=None, rev=None, clean=False, date=None):
    # Replacement for the 'update' command installed by uisetup().
    # It snapshots '.bigfiles' before and after running the stock
    # commands.update, then unlinks big files that dropped out of the list
    # and fetches new or changed ones from the versions directory.
    # Returns whatever commands.update returned.
    bigfiles0 = parse_bigfiles(repo)
    res = commands.update(ui, repo, node, rev, clean, date)
    bigfiles1 = parse_bigfiles(repo)

    # manifests of the two dirstate parents, loaded only when first needed
    m1 = None
    m2 = None
    for file in bigfiles0.keys():
        if file not in bigfiles1:
            # NOTE(review): 'not m1' is also true for an empty manifest, in
            # which case the manifests are reloaded on every iteration.
            if not m1:
                parent1, parent2 = repo.dirstate.parents()
                m1 = repo[parent1].manifest()
                m2 = repo[parent2].manifest()
            # NOTE(review): the file is left alone when it is in the first
            # parent's manifest OR absent from the second parent's manifest.
            # Presumably the second parent is null outside a merge, which
            # would make this skip every file in the common case - confirm
            # that this is the intended condition.
            if file in m1 or file not in m2:
                continue
            try:
                f = repo.wjoin(file)
                # lstat raises OSError when there is no working copy, which
                # skips the unlink below
                os.lstat(f)
                ui.write(_("unlinking %s.%s\n") % (file, bigfiles0[file])) 
                util.unlink(f)
            except OSError:
                pass

    # files that are new in '.bigfiles' or whose recorded checksum changed
    tofetch = {}
    for file, hash in bigfiles1.iteritems():
        if file not in bigfiles0 or bigfiles0[file] != hash:
            tofetch[file] = hash

    if tofetch:
        brepo = bigfiles_repo(ui)
        missing = {}
        for file, hash in tofetch.iteritems():
            f = repo.wjoin(file)
            rf = "%s/%s.%s" % (brepo, file, hash)
            try:
                # any OSError raised inside this try (not only from this
                # lstat) marks the file as missing
                os.lstat(rf)           
                ui.write(_("fetching %s.%s\n") % (file, hash)) 
                util.makedirs(os.path.dirname(f))
                try:
                    # remove an existing working copy before copying
                    os.lstat(f)
                    util.unlink(f)
                except OSError:
                    pass
                util.copyfile(rf, f)
            except OSError:
                missing[file] = hash
        if missing:
            ui.write(_("\nNeeded files missing in bigrepo %s:\n") % brepo) 
            for file, hash in missing.iteritems():
                ui.write("%s.%s\n" % (file, hash)) 
    return res

def _findrepo(p):
    '''return the nearest directory at or above p that contains '.hg'.

    Walks up through the parents of p; returns None when no such
    directory is found.'''
    current = p
    while True:
        if os.path.isdir(os.path.join(current, ".hg")):
            return current
        parent = os.path.dirname(current)
        if parent == current:
            # dirname() no longer changes the path: nothing left to try
            return None
        current = parent

def uisetup(ui):
    '''install the extension's hooks when it is loaded.

    Replaces the 'update' command with my_update (keeping the original
    help text and option table) and, when the current directory is inside
    a repository that has a '.bigfiles' file, registers that file as an
    additional ignore file.'''
    key = "^update|up|checkout|co"
    tmp = commands.table[key]
    my_update.__doc__ = tmp[0].__doc__
    commands.table[key] = (my_update, tmp[1], tmp[2])

    path_bigfiles = _findrepo(os.getcwd())
    if path_bigfiles is None:
        return

    path_bigfiles = path_bigfiles+'/.bigfiles'
    try:
        os.stat(path_bigfiles)
    except OSError:
        # no '.bigfiles' (or it is not accessible): nothing to register
        return
    # set the option on the parent ui when there is one; getattr keeps this
    # best-effort on ui objects that lack a parentui attribute
    target = getattr(ui, 'parentui', None) or ui
    target.setconfig("ui", "ignore.bigfiles", path_bigfiles)

# Command table read by Mercurial: maps 'name|alias...' to a tuple of
# (function, option list, synopsis). The two dry-run help strings are each
# kept on a single line; a single-quoted literal split across lines is a
# syntax error.
cmdtable = {
    'bigstatus|bstatus':
        (bigstatus,
         [('n', 'no-status', None, _('hide status prefix')),
         ] + commands.walkopts,
        _('hg bigstatus [SOURCE]')),
    'bigrefresh|brefresh':
        (bigrefresh,
         [('n', 'dry-run', None,
           _('do not perform actions, just print output')),
         ] + commands.walkopts,
        _('hg bigrefresh [SOURCE]')),
    'bigupdate|bup|bigcheckout|bco':
        (bigupdate,
         [('n', 'dry-run', None,
           _('do not perform actions, just print output')),
         ] + commands.walkopts,
        _('hg bigupdate [SOURCE]')),
}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: bigfiles.py
Type: application/octet-stream
Size: 11963 bytes
Desc: not available
Url : http://selenic.com/pipermail/mercurial/attachments/20081104/29a6f69e/attachment.obj 


More information about the Mercurial mailing list