Trouble Committing Binary Files
Andrei Vermel
avermel at mail.ru
Mon Nov 3 23:22:03 CST 2008
> On Mon, 2008-11-03 at 14:48 -0800, Clark Hwang wrote:
> > I can't seem to commit a binary file that's about 300 MB in
> size. My
> > workstation has 2 GIB of RAM. Mercurial tells me "abort: out of
> > memory".
> > I was thinking the 300 MB was not over the 2 x file size limit on my
> > workstation. Please advise.
>
> Which version of Mercurial are you using and how much -free- memory do
> you have? If you're running Linux, send the output of the 'free'
> command.
>
> The factor may actually be more like 5 or 6.
I was able to check in a 170 MB file on a 2 GB Windows box, but it aborted
with "out of memory" during checkout, so checkout is probably the bottleneck
now. As a workaround, my co-workers and I are now using the 'bigfiles'
extension:
http://www.selenic.com/pipermail/mercurial/2008-October/021899.html
The latest version is below and attached.
# bigfiles.py: Support versions of big files with storage outside hg repo.
#
# Copyright 2008 Andrei Vermel <andrei.vermel at gmail.com>
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.
'''support versions of big files with storage outside hg repo.
To setup the extension add to a config file:
[bigfiles]
repo = path/to/versions/dir
Big files are not put to hg repo. They are listed in a file called
'.bigfiles', which also serves as an ignore file similar to .hgignore, so
they
do not clutter output of hg commands. The file also stores check sums of the
big files in a form of comments. File '.bigfiles' is versioned by hg, so
each
changeset knows which big files it uses from the names and checksums.
The file can be diffed and merged, which is nice.
The versions of big files are stored in a versions directory, with checksums
attached to filenames.
The extension overrides 'hg update', so that it can compare contents of
'.bigfiles' before and after the update to remove and fetch appropriate big
files.
The directory storing versions of big files can be synced with the remote
one
(the extension doesn't do this, but tells the list of the necessary files).
The versions corresponding to old changesets can be removed to save space.
To add a new big file, use normal 'hg add', ignoring the size warning.
To remove a tracked big file, just delete it.
Then use:
'hg bstatus' - to examine state of big files in working directory.
'hg brefresh' - to refresh '.bigfiles' and versions directory with added,
removed and modified big files.
'hg bupdate' - to fetch files from versions directory as recorded in
'.bigfiles', and get a list of necessary files missing in the
version directory.'''
from mercurial.i18n import _
from mercurial.node import *
from mercurial import commands, cmdutil, hg, node, util
import os, stat, cPickle, errno
_sha1 = util.sha1
def parse_bigfiles(repo):
    '''Read the working directory's '.bigfiles' list into a dict.

    Every line of the form "path#checksum" becomes bigfiles[path] = checksum.
    The split is done on the last '#', so a path may itself contain '#'.
    Lines without any '#' (for example the "syntax: glob" header) are
    skipped.

    Returns an empty dict when '.bigfiles' does not exist; any other
    IOError raised while opening the file is propagated.
    '''
    fname = repo.wjoin('.bigfiles')
    bigfiles = {}
    try:
        fp = open(fname)
    except IOError as err:
        # A missing list simply means no big files are recorded yet.
        if err.errno != errno.ENOENT:
            raise
        return bigfiles
    try:
        for line in fp:
            if '#' not in line:
                continue
            path, checksum = line.strip().rsplit('#', 1)
            bigfiles[path] = checksum
    finally:
        # Close explicitly instead of relying on garbage collection.
        fp.close()
    return bigfiles
def bigfiles_repo(ui):
    '''Return the configured versions directory, aborting if it is unusable.

    The path is read from the 'repo' setting of the [bigfiles] config
    section.  util.Abort is raised when the setting is empty, when the
    path cannot be lstat()ed, or when it is not a directory.
    '''
    brepo = ui.config('bigfiles', 'repo')
    if not brepo:
        raise util.Abort(_('bigfiles.repo path not configured'))
    try:
        mode = os.lstat(brepo).st_mode
    except OSError:
        raise util.Abort(_("can't access bigfiles repo: %s") % brepo)
    if stat.S_ISDIR(mode):
        return brepo
    raise util.Abort(
        _('specified bigfiles repo %s is not a directory') % brepo)
def _hash(f):
    '''Return the hex digest (via _sha1) of the file at path f.

    The file is opened in binary mode and fed to the hash in chunks of
    1000000 bytes, so a big file is never read into memory in one piece.
    '''
    fp = open(f, 'rb')
    try:
        s = _sha1("")
        while True:
            text = fp.read(1000000)
            if not text:
                # An empty read means end of file.
                break
            s.update(text)
    finally:
        # Always release the handle, even if reading fails part-way.
        fp.close()
    return s.hexdigest()
def read_bigfiledirstate(ui, repo):
    '''Load the cached big-file state from '.hg/bigfiledirstate'.

    Returns the unpickled object, or an empty dict when the cache file
    does not exist.  Any other IOError raised while opening the file is
    propagated.  The ui argument is not used.
    '''
    try:
        fp = open(repo.wjoin(".hg/bigfiledirstate"), "rb")
    except IOError as err:
        # No cache yet: start from an empty state.
        if err.errno != errno.ENOENT:
            raise
        return {}
    try:
        # NOTE(review): unpickling can run arbitrary code; this assumes the
        # file under .hg is only ever written by this extension.
        return cPickle.load(fp)
    finally:
        # Close the handle even when the pickle is corrupt and load() raises.
        fp.close()
def write_bigfiledirstate(ui, repo, ds):
    '''Pickle the big-file state ds into '.hg/bigfiledirstate'.

    The file is overwritten.  The ui argument is not used.
    '''
    fp = open(repo.wjoin(".hg/bigfiledirstate"), "wb")
    try:
        cPickle.dump(ds, fp)
    finally:
        # Close the handle even if ds turns out not to be picklable.
        fp.close()
def update_bigfiledirstate(repo, file, st, ds):
    '''Record file's current size, mtime and content hash in ds.

    st is the stat result for the file; the hash is computed from the
    file's path in the working directory.
    '''
    size = st.st_size
    mtime = st.st_mtime
    digest = _hash(repo.wjoin(file))
    ds[file] = (size, mtime, digest)
def accelerated_hash(repo, file, st, ds):
    '''Return file's content hash, reusing the cached one when possible.

    The hash stored in ds is returned when both the recorded size and
    mtime equal those in st; otherwise the entry is recomputed through
    update_bigfiledirstate and the fresh hash is returned.
    '''
    if file in ds:
        size, mtime = ds[file][0], ds[file][1]
        unchanged = (size == st.st_size and mtime == st.st_mtime)
        if unchanged:
            return ds[file][2]
    # Cache miss or stale entry: rehash and remember the result.
    update_bigfiledirstate(repo, file, st, ds)
    return ds[file][2]
def _bigstatus(ui, repo, pats, opts, ds, bigfiles):
    '''Classify big files in the working directory.

    bigfiles maps each path recorded in '.bigfiles' to its recorded hash;
    ds is the size/mtime/hash cache, which may be updated as a side effect
    of hashing.

    Returns a 6-tuple of lists of file names:
      tracked_gotbig - first status list from hg, larger than MAX_SIZE
      added_big      - second status list from hg, larger than MAX_SIZE
      modified       - recorded in '.bigfiles', current hash differs
      removed        - recorded, absent from the working directory, but
                       its version exists in the versions directory
      gotsmall       - recorded, size is now at most MAX_SIZE
      missinginrepo  - recorded, absent from both the working directory
                       and the versions directory
    '''
    MAX_SIZE = 10000000
    brepo = bigfiles_repo(ui)

    def oversized(names):
        # Keep only the names whose working-directory file exceeds MAX_SIZE.
        return [name for name in names
                if os.lstat(repo.wjoin(name)).st_size > MAX_SIZE]

    node1, node2 = cmdutil.revpair(repo, None)
    matcher = cmdutil.match(repo, pats, opts)
    mod_all, added_all = repo.status(node1, node2, matcher,
                                     None, None, True)[0:2]
    tracked_gotbig = oversized(mod_all)   # not in .bigfiles
    added_big = oversized(added_all)      # not in .bigfiles

    modified = []        # already in .bigfiles
    removed = []         # missing, but still in .bigfiles
    gotsmall = []        # still in .bigfiles
    missinginrepo = []   # file recorded in .bigfiles not in bigfiles repo
    for name, recorded in bigfiles.iteritems():
        try:
            st = os.lstat(repo.wjoin(name))
        except OSError:
            # Not in the working directory: see whether the recorded
            # version can still be found in the versions directory.
            stored = "%s/%s.%s" % (brepo, name, recorded)
            try:
                os.lstat(stored)
            except OSError:
                missinginrepo.append(name)
            else:
                removed.append(name)
            continue
        if st.st_size <= MAX_SIZE:
            gotsmall.append(name)
        if accelerated_hash(repo, name, st, ds) != recorded:
            modified.append(name)
    return (tracked_gotbig, added_big, modified, removed, gotsmall,
            missinginrepo)
def bigstatus(ui, repo, *pats, **opts):
    '''show changed big files in the working directory

    Show status of big files in the repository.

    The codes used to show the status of files are:
    B = tracked by hg, got too big.
    A = added to hg, too big
    M = modified
    S = got small, can now be tracked by hg
    ! = missing in working dir, present in versions repo
    R = missing in both working dir and versions repo'''
    ds = read_bigfiledirstate(ui, repo)
    bigfiles = parse_bigfiles(repo)
    groups = _bigstatus(ui, repo, pats, opts, ds, bigfiles)
    # One code per list, in the order _bigstatus returns them:
    # tracked_gotbig, added_big, modified, removed, gotsmall, missinginrepo.
    codes = ('B', 'A', 'M', '!', 'S', 'R')
    for names, code in zip(groups, codes):
        for name in names:
            if not opts['no_status']:
                ui.write("%s %s\n" % (code, name))
            else:
                ui.write("%s\n" % name)
    # Persist any hashes that were (re)computed while gathering status.
    write_bigfiledirstate(ui, repo, ds)
def _updatebigrepo(ui, repo, files, brepo, bigfiles, ds):
for file in files:
f = repo.wjoin(file)
hash = accelerated_hash(repo, file, os.lstat(f), ds)
bigfiles[file] = hash
rf = "%s/%s.%s" % (brepo, file, hash)
util.makedirs(os.path.dirname(rf))
util.copyfile(f, rf)
def bigrefresh(ui, repo, *pats, **opts):
    '''update big files tracking as per working directory.

    Added big files get forgotten and added to '.bigfiles' instead.
    Removed big files are deleted from '.bigfiles'.
    Files tracked by hg that got too big are removed from hg, and added
    to '.bigfiles'.
    Copies of new and modified big files are stored in versions
    directory.'''
    ds = read_bigfiledirstate(ui, repo)
    bigfiles = parse_bigfiles(repo)
    tracked_gotbig, added_big, modified, removed, gotsmall, \
        missinginrepo = _bigstatus(ui, repo, pats, opts, ds, bigfiles)
    dry_run = opts['dry_run']
    # Entries whose working copy is gone, whether or not a stored version
    # still exists; both kinds are dropped from '.bigfiles'.
    gone = removed + missinginrepo

    for f in added_big:
        ui.write(_("forgetting %s\n") % f)
    if not dry_run:
        repo.forget(added_big)

    for f in tracked_gotbig:
        ui.write(_("removing %s\n") % f)
    if not dry_run:
        # unlink=False: stop tracking in hg but keep the file on disk.
        repo.remove(tracked_gotbig, unlink=False)

    for f in gone:
        ui.write(_("recording removal of %s\n") % f)

    # Validate the versions directory even on a dry run, so configuration
    # problems are reported either way.
    brepo = bigfiles_repo(ui)
    if not dry_run:
        _updatebigrepo(ui, repo, tracked_gotbig + added_big + modified,
                       brepo, bigfiles, ds)
        for file in gone:
            del bigfiles[file]
        fp = open(repo.wjoin('.bigfiles'), 'w')
        try:
            fp.write("syntax: glob\n\n")
            for f in util.sort(bigfiles.keys()):
                fp.write("%s#%s\n" % (f, bigfiles[f]))
        finally:
            # Do not leak the handle if a write fails part-way.
            fp.close()
    write_bigfiledirstate(ui, repo, ds)
def bigupdate(ui, repo, *pats, **opts):
    '''fetch files from versions directory as recorded in '.bigfiles'.

    Also complain about necessary files missing in the version directory'''
    ds = read_bigfiledirstate(ui, repo)
    bigfiles = parse_bigfiles(repo)
    tracked_gotbig, added_big, modified, removed, gotsmall, \
        missinginrepo = _bigstatus(ui, repo, pats, opts, ds, bigfiles)
    brepo = bigfiles_repo(ui)

    # Restore every recorded file that is absent from the working
    # directory but available in the versions directory.
    for name in removed:
        target = repo.wjoin(name)
        stored = "%s/%s.%s" % (brepo, name, bigfiles[name])
        ui.write(_("fetching %s\n") % stored)
        if opts['dry_run']:
            continue
        util.makedirs(os.path.dirname(target))
        util.copyfile(stored, target)

    # List the versions that would have to be obtained from elsewhere.
    if missinginrepo:
        ui.write(_("\nNeeded files missing in bigrepo %s:\n") % brepo)
        for name in missinginrepo:
            ui.write("%s.%s\n" % (name, bigfiles[name]))
    write_bigfiledirstate(ui, repo, ds)
def my_update(ui, repo, node=None, rev=None, clean=False, date=None):
    # Wrapper around commands.update that also synchronises big files.
    # uisetup() installs it in place of the stock update command and copies
    # that command's __doc__ onto it, so no docstring is defined here.
    #
    # '.bigfiles' is parsed before and after the real update; the two
    # listings are compared to decide which big files to unlink and which
    # to fetch from the versions directory.  Returns whatever
    # commands.update returned.
    bigfiles0 = parse_bigfiles(repo)
    res = commands.update(ui, repo, node, rev, clean, date)
    bigfiles1 = parse_bigfiles(repo)
    # Manifests of the working directory parents, loaded on first use.
    m1 = None
    m2 = None
    # Pass 1: big files that were recorded before the update but are no
    # longer recorded afterwards.
    for file in bigfiles0.keys():
        if file not in bigfiles1:
            if not m1:
                parent1, parent2 = repo.dirstate.parents()
                m1 = repo[parent1].manifest()
                m2 = repo[parent2].manifest()
            # Skip files that are in the first parent's manifest or absent
            # from the second parent's manifest.
            # NOTE(review): when there is no second parent, m2 is presumably
            # empty, so 'file not in m2' would always hold and nothing would
            # ever be unlinked -- confirm whether 'file in m2' was intended.
            if file in m1 or file not in m2:
                continue
            try:
                f = repo.wjoin(file)
                # lstat raises OSError when the file is already gone, which
                # skips the message and the unlink.
                os.lstat(f)
                ui.write(_("unlinking %s.%s\n") % (file, bigfiles0[file]))
                util.unlink(f)
            except OSError:
                pass
    # Pass 2: big files that are new or whose recorded hash changed.
    tofetch = {}
    for file, hash in bigfiles1.iteritems():
        if file not in bigfiles0 or bigfiles0[file] != hash:
            tofetch[file] = hash
    if tofetch:
        brepo = bigfiles_repo(ui)
        missing = {}
        for file, hash in tofetch.iteritems():
            f = repo.wjoin(file)
            rf = "%s/%s.%s" % (brepo, file, hash)
            try:
                # Raises OSError when the wanted version is not stored in
                # the versions directory; it is then reported as missing.
                os.lstat(rf)
                ui.write(_("fetching %s.%s\n") % (file, hash))
                util.makedirs(os.path.dirname(f))
                # Remove a stale working copy, if any, before copying.
                try:
                    os.lstat(f)
                    util.unlink(f)
                except OSError:
                    pass
                util.copyfile(rf, f)
            except OSError:
                missing[file] = hash
        if missing:
            ui.write(_("\nNeeded files missing in bigrepo %s:\n") % brepo)
            for file, hash in missing.iteritems():
                ui.write("%s.%s\n" % (file, hash))
    return res
def _findrepo(p):
while not os.path.isdir(os.path.join(p, ".hg")):
oldp, p = p, os.path.dirname(p)
if p == oldp:
return None
return p
def uisetup(ui):
    '''Install the update wrapper and register '.bigfiles' as an ignore file.

    The stock update entry in commands.table is replaced by my_update,
    which inherits the original command's help text, option list and
    synopsis.  If the current directory is inside a repository that has a
    '.bigfiles' file, that file is registered as the "ignore.bigfiles"
    setting of the [ui] section.
    '''
    key = "^update|up|checkout|co"
    tmp = commands.table[key]
    my_update.__doc__ = tmp[0].__doc__
    commands.table[key] = (my_update, tmp[1], tmp[2])

    path_bigfiles = _findrepo(os.getcwd())
    if path_bigfiles is None:
        return
    path_bigfiles = path_bigfiles + '/.bigfiles'
    try:
        os.stat(path_bigfiles)
    except OSError:
        # No accessible '.bigfiles' here: nothing to register.
        return
    # Prefer the parent ui when there is one; fall back to ui itself when
    # the attribute is absent or empty.
    target = getattr(ui, 'parentui', None) or ui
    target.setconfig("ui", "ignore.bigfiles", path_bigfiles)
# Command table: maps "name|alias..." to (function, options, synopsis).
# The dry-run help strings are kept on a single logical line each; a string
# literal broken across physical lines is a syntax error.
cmdtable = {
    'bigstatus|bstatus':
        (bigstatus,
         [('n', 'no-status', None, _('hide status prefix')),
          ] + commands.walkopts,
         _('hg bigstatus [SOURCE]')),
    'bigrefresh|brefresh':
        (bigrefresh,
         [('n', 'dry-run', None,
           _('do not perform actions, just print output')),
          ] + commands.walkopts,
         _('hg bigrefresh [SOURCE]')),
    'bigupdate|bup|bigcheckout|bco':
        (bigupdate,
         [('n', 'dry-run', None,
           _('do not perform actions, just print output')),
          ] + commands.walkopts,
         _('hg bigupdate [SOURCE]')),
}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: bigfiles.py
Type: application/octet-stream
Size: 11963 bytes
Desc: not available
Url : http://selenic.com/pipermail/mercurial/attachments/20081104/29a6f69e/attachment.obj
More information about the Mercurial
mailing list