[PATCH RFC] history punching

Chris Mason mason at suse.com
Fri Jun 23 15:58:40 CDT 2006


Hello everyone,

This implements hg clone --punch rev:rev --pull  to punch holes in the 
history of large repos.  It allows you to keep links to old changesets but
remove them from the .hg directory to save space.  A punched repo
is pullable and pushable.

This patch has been around for a while, I just rediffed against tip today
in hopes of finally getting it off my queue.  It is relatively low risk, but
we do need to finalize the way holes are sent over the wire.  The
current code just sends a delta of -1.

-chris

diff -r b200a4775fb9 mercurial/commands.py
--- a/mercurial/commands.py	Thu Jun 22 11:24:08 2006 -0400
+++ b/mercurial/commands.py	Thu Jun 22 13:13:15 2006 -0400
@@ -973,14 +973,21 @@ def clone(ui, source, dest=None, **opts)
 
     else:
         revs = None
+        punch = None
         if opts['rev']:
             if not other.local():
                 error = _("clone -r not supported yet for remote repositories.")
                 raise util.Abort(error)
             else:
                 revs = [other.lookup(rev) for rev in opts['rev']]
+        if opts['punch']:
+            if not other.local():
+                error = _("clone --punch not supported yet for remote repositories.")
+                raise util.Abort(error)
+            else:
+                punch = map(other.lookup, list(revrange(ui, other, opts['punch'])))
         repo = hg.repository(ui, dest, create=1)
-        repo.pull(other, heads = revs)
+        repo.pull(other, heads = revs, punch = punch)
 
     f = repo.opener("hgrc", "w", text=True)
     f.write("[paths]\n")
@@ -2884,6 +2891,8 @@ table = {
          [('U', 'noupdate', None, _('do not update the new working directory')),
           ('r', 'rev', [],
            _('a changeset you would like to have after cloning')),
+          ('p', 'punch', [],
+           _('revisions to trim from the history')),
           ('', 'pull', None, _('use pull protocol to copy metadata')),
           ('e', 'ssh', '', _('specify ssh command to use')),
           ('', 'remotecmd', '',
diff -r b200a4775fb9 mercurial/filelog.py
--- a/mercurial/filelog.py	Thu Jun 22 11:24:08 2006 -0400
+++ b/mercurial/filelog.py	Thu Jun 22 13:13:15 2006 -0400
@@ -94,7 +94,8 @@ class filelog(revlog):
         hist = {}
 
         for r,n in visit:
-            curr = decorate(self.read(n), self.linkrev(n))
+            if not self.punched(self.rev(n)):
+                curr = decorate(self.read(n), self.linkrev(n))
             for p in self.parents(n):
                 if p != nullid:
                     curr = pair(hist[p], curr)
diff -r b200a4775fb9 mercurial/localrepo.py
--- a/mercurial/localrepo.py	Thu Jun 22 11:24:08 2006 -0400
+++ b/mercurial/localrepo.py	Thu Jun 22 13:13:15 2006 -0400
@@ -1097,7 +1097,7 @@ class localrepository(object):
         else:
             return subset
 
-    def pull(self, remote, heads=None, force=False):
+    def pull(self, remote, heads=None, force=False, punch=None):
         l = self.lock()
 
         fetch = self.findincoming(remote, force=force)
@@ -1108,10 +1108,10 @@ class localrepository(object):
             self.ui.status(_("no changes found\n"))
             return 0
 
-        if heads is None:
+        if heads is None and punch is None:
             cg = remote.changegroup(fetch, 'pull')
         else:
-            cg = remote.changegroupsubset(fetch, heads, 'pull')
+            cg = remote.changegroupsubset(fetch, heads, 'pull', punch=punch)
         return self.addchangegroup(cg, 'pull')
 
     def push(self, remote, force=False, revs=None):
@@ -1184,7 +1184,7 @@ class localrepository(object):
             return remote.unbundle(cg, remote_heads, 'push')
         return ret[1]
 
-    def changegroupsubset(self, bases, heads, source):
+    def changegroupsubset(self, bases, heads, source, punch=None):
         """This function generates a changegroup consisting of all the nodes
         that are descendents of any of the bases, and ancestors of any of
         the heads.
@@ -1197,6 +1197,14 @@ class localrepository(object):
         the changegroup a particular filenode or manifestnode belongs to."""
 
         self.hook('preoutgoing', throw=True, source=source)
+
+        # if a regular rev has a punched revision for a parent, we have
+        # to make sure not to punch any file revisions in the regular revision
+        # manifest.  This is recorded in unpunched_revs
+        unpunched_revs = {}
+        punchrevs = {}
+        if punch:
+            punchrevs = dict.fromkeys(punch)
 
         # Set up some initial variables
         # Make it easy to refer to self.changelog
@@ -1238,6 +1246,10 @@ class localrepository(object):
         # Nor do we know which filenodes are missing.
         msng_filenode_set = {}
 
+        # record multiply linked revisions so that we don't punch away
+        # revisions used by changesets that are not punched.
+        many_link_revs = {}
+
         junk = mnfst.index[mnfst.count() - 1] # Get around a bug in lazyindex
         junk = None
 
@@ -1290,11 +1302,17 @@ class localrepository(object):
             # the manifest.
             def collect_manifests_and_files(clnode):
                 c = cl.read(clnode)
+                if punchrevs:
+                    pin_punched_revs(clnode, c)
                 for f in c[3]:
                     # This is to make sure we only have one instance of each
                     # filename string for each filename.
                     changedfileset.setdefault(f, f)
-                msng_mnfst_set.setdefault(c[0], clnode)
+                oldclnode = msng_mnfst_set.get(c[0])
+                if oldclnode:
+                    many_link_revs.setdefault(c[0], [oldclnode]).append(clnode)
+                else:
+                    msng_mnfst_set[c[0]] = clnode
             return collect_manifests_and_files
 
         # Figure out which manifest nodes (of the ones we think might be part
@@ -1349,7 +1367,12 @@ class localrepository(object):
                             ndset = msng_filenode_set.setdefault(f, {})
                             # And set the filenode's changelog node to the
                             # manifest's if it hasn't been set already.
-                            ndset.setdefault(fnode, clnode)
+                            oldcl = ndset.get(fnode)
+                            if oldcl:
+                                many_link_revs.setdefault(fnode,
+                                                        [oldcl]).append(clnode)
+                            else:
+                                ndset[fnode] = clnode
                 else:
                     # Otherwise we need a full manifest.
                     m = mnfst.read(mnfstnode)
@@ -1361,7 +1384,12 @@ class localrepository(object):
                             # See comments above.
                             clnode = msng_mnfst_set[mnfstnode]
                             ndset = msng_filenode_set.setdefault(f, {})
-                            ndset.setdefault(fnode, clnode)
+                            oldcl = ndset.get(fnode)
+                            if oldcl:
+                                many_link_revs.setdefault(fnode,
+                                                        [oldcl]).append(clnode)
+                            else:
+                                ndset[fnode] = clnode
                 # Remember the revision we hope to see next.
                 next_rev[0] = r + 1
             return collect_msng_filenodes
@@ -1389,6 +1417,43 @@ class localrepository(object):
                 return msngset[fnode]
             return lookup_filenode_link
 
+        # this fills in the unpunched_revs hash for every changeset we read
+        def pin_punched_revs(clnode, c):
+            pp = cl.parents(clnode)
+            if clnode in punchrevs:
+                return
+            if pp[0] not in punchrevs and pp[1] not in punchrevs:
+                return
+            m = mnfst.read(c[0])
+            unpunched_revs[c[0]] = 1
+            for x in m:
+                unpunched_revs[m[x]] = 1
+
+        # This takes a of revisions->changeset from a file (or the manifest)
+        # and finds the corresponding changesets in the punchrevs dict.  A
+        # dict of node ids is created for sending down to the 
+        # revlog.group function
+        def build_punchrevs(revs):
+            temp_punch = {}
+            if punchrevs:
+                for x in revs:
+                    # don't include anything mentioned in the unpunched_revs
+                    # dict
+                    if x in unpunched_revs:
+                        continue
+                    clnode = revs[x]
+                    if clnode in punchrevs:
+                        punchit = True
+                        ar = many_link_revs.get(x, [])
+                        for y in ar:
+                            # don't include anything that is referenced
+                            # by a later changeset we did not want to punch
+                            if y not in punchrevs:
+                                punchit = False
+                        if punchit:
+                            temp_punch[x] = 1
+            return temp_punch
+
         # Now that we have all theses utility functions to help out and
         # logically divide up the task, generate the group.
         def gengroup():
@@ -1396,6 +1461,8 @@ class localrepository(object):
             changedfiles = {}
             # Create a changenode group generator that will call our functions
             # back to lookup the owning changenode and collect information.
+            # we don't send the punch list to cl.group because changelog
+            # records are integral to the push/pull process
             group = cl.group(msng_cl_lst, identity,
                              manifest_and_file_collector(changedfiles))
             for chnk in group:
@@ -1409,8 +1476,10 @@ class localrepository(object):
             msng_mnfst_lst.sort(cmp_by_rev_func(mnfst))
             # Create a generator for the manifestnodes that calls our lookup
             # and data collection functions back.
+            temp_punch = build_punchrevs(msng_mnfst_set)
             group = mnfst.group(msng_mnfst_lst, lookup_manifest_link,
-                                filenode_collector(changedfiles))
+                                filenode_collector(changedfiles),
+                                punchrevs=temp_punch)
             for chnk in group:
                 yield chnk
 
@@ -1440,8 +1509,10 @@ class localrepository(object):
                     # Create a group generator and only pass in a changenode
                     # lookup function as we need to collect no information
                     # from filenodes.
+                    temp_punch = build_punchrevs(msng_filenode_set[fname])
                     group = filerevlog.group(msng_filenode_lst,
-                                             lookup_filenode_link_func(fname))
+                                             lookup_filenode_link_func(fname),
+                                             punchrevs=temp_punch)
                     for chnk in group:
                         yield chnk
                 if msng_filenode_set.has_key(fname):
@@ -1479,9 +1550,10 @@ class localrepository(object):
 
         def changed_file_collector(changedfileset):
             def collect_changed_files(clnode):
-                c = cl.read(clnode)
-                for fname in c[3]:
-                    changedfileset[fname] = 1
+                if not cl.punched(cl.rev(clnode)):
+                    c = cl.read(clnode)
+                    for fname in c[3]:
+                        changedfileset[fname] = 1
             return collect_changed_files
 
         def lookuprevlink_func(revlog):
@@ -1990,6 +2062,7 @@ class localrepository(object):
         seen = {}
         self.ui.status(_("checking changesets\n"))
         checksize(self.changelog, "changelog")
+        punched_manifests = {}
 
         for i in range(self.changelog.count()):
             changesets += 1
@@ -2005,6 +2078,10 @@ class localrepository(object):
                 if p not in self.changelog.nodemap:
                     err(_("changeset %s has unknown parent %s") %
                                  (short(n), short(p)))
+
+            if self.changelog.punched(i):
+                continue
+
             try:
                 changes = self.changelog.read(n)
             except KeyboardInterrupt:
@@ -2016,6 +2093,10 @@ class localrepository(object):
 
             neededmanifests[changes[0]] = n
 
+            mftnode = changes[0]
+            if self.manifest.punched(self.manifest.rev(mftnode)):
+                punched_manifests[i] = mftnode
+
             for f in changes[3]:
                 filelinkrevs.setdefault(f, []).append(i)
 
@@ -2043,6 +2124,9 @@ class localrepository(object):
                 if p not in self.manifest.nodemap:
                     err(_("manifest %s has unknown parent %s") %
                         (short(n), short(p)))
+
+            if self.manifest.punched(i):
+                continue
 
             try:
                 delta = mdiff.patchtext(self.manifest.delta(n))
@@ -2073,7 +2157,13 @@ class localrepository(object):
 
         for f in filelinkrevs:
             if f not in filenodes:
-                err(_("file %s in changeset but not in manifest") % f)
+                csets = filelinkrevs[f]
+                failed = False
+                for x in csets:
+                    if x not in punched_manifests:
+                        failed = True
+                if failed:
+                    err(_("file %s in changeset but not in manifest") % f)
 
         self.ui.status(_("checking files\n"))
         ff = filenodes.keys()
@@ -2098,7 +2188,8 @@ class localrepository(object):
                 if n in seen:
                     err(_("%s: duplicate revision %d") % (f, i))
                 if n not in filenodes[f]:
-                    err(_("%s: %d:%s not in manifests") % (f, i, short(n)))
+                    if fl.linkrev(n) not in punched_manifests:
+                        err(_("%s: %d:%s not in manifests") % (f, i, short(n)))
                 else:
                     del filenodes[f][n]
 
@@ -2110,13 +2201,14 @@ class localrepository(object):
                     filelinkrevs[f].remove(flr)
 
                 # verify contents
-                try:
-                    t = fl.read(n)
-                except KeyboardInterrupt:
-                    self.ui.warn(_("interrupted"))
-                    raise
-                except Exception, inst:
-                    err(_("unpacking file %s %s: %s") % (f, short(n), inst))
+                if not fl.punched(i):
+                    try:
+                        t = fl.read(n)
+                    except KeyboardInterrupt:
+                        self.ui.warn(_("interrupted"))
+                        raise
+                    except Exception, inst:
+                        err(_("unpacking file %s %s: %s") % (f, short(n), inst))
 
                 # verify parents
                 (p1, p2) = fl.parents(n)
diff -r b200a4775fb9 mercurial/revlog.py
--- a/mercurial/revlog.py	Thu Jun 22 11:24:08 2006 -0400
+++ b/mercurial/revlog.py	Thu Jun 22 13:13:15 2006 -0400
@@ -421,7 +421,10 @@ class revlog(object):
                 self.nodemap[e[-1]] = n
                 n += 1
                 if inline:
-                    off += e[1]
+                    sz = e[1]
+                    if sz < 0:
+                        sz = 0
+                    off += sz
                     if off > l:
                         # some things don't seek well, just read it
                         fp.read(off - l)
@@ -517,11 +520,21 @@ class revlog(object):
         return l
         """
 
+    def punched(self, rev):
+        if rev < 0:
+            return False
+        if self.index[rev][1] == -1:
+            return True
+        return False
+
     def length(self, rev):
         if rev < 0:
             return 0
         else:
-            return self.index[rev][1]
+            ret = self.index[rev][1]
+            if ret < 0:
+                ret = 0
+            return ret
     def base(self, rev): return (rev < 0) and rev or self.index[rev][-5]
 
     def reachable(self, rev, stop=None):
@@ -762,6 +775,9 @@ class revlog(object):
         return mdiff.patches(t, pl)
 
     def chunk(self, rev, df=None, cachelen=4096):
+        if self.punched(rev):
+            raise RevlogError(_("reading punched revision %d on file %s" %
+                              (rev, self.indexfile)))
         start, length = self.start(rev), self.length(rev)
         inline = self.inlinedata()
         if inline:
@@ -807,6 +823,9 @@ class revlog(object):
         b2 = self.base(rev2)
         if b1 == b2 and rev1 + 1 == rev2:
             return self.chunk(rev2)
+        elif self.punched(rev1):
+            return self.diff("",
+                             self.revision(self.node(rev2)))
         else:
             return self.diff(self.revision(self.node(rev1)),
                              self.revision(self.node(rev2)))
@@ -920,7 +939,7 @@ class revlog(object):
         n = self.count()
         t = n - 1
 
-        if n:
+        if n and not self.punched(t):
             base = self.base(t)
             start = self.start(base)
             end = self.end(t)
@@ -933,7 +952,8 @@ class revlog(object):
 
         # full versions are inserted when the needed deltas
         # become comparable to the uncompressed text
-        if not n or dist > len(text) * 2:
+        if (not n or self.punched(self.base(t)) or self.punched(t)
+            or dist > len(text) * 2):
             data = compress(text)
             l = len(data[1]) + len(data[0])
             base = n
@@ -1059,7 +1079,7 @@ class revlog(object):
                 #print "next x"
                 gx = x.next()
 
-    def group(self, nodelist, lookup, infocollect=None):
+    def group(self, nodelist, lookup, infocollect=None, punchrevs={}):
         """calculate a delta group
 
         Given a list of changeset revs, return a set of deltas and
@@ -1082,12 +1102,21 @@ class revlog(object):
         # build deltas
         for d in xrange(0, len(revs) - 1):
             a, b = revs[d], revs[d + 1]
+            na = self.node(a)
             nb = self.node(b)
 
             if infocollect is not None:
                 infocollect(nb)
 
-            d = self.revdiff(a, b)
+            # if a is punched, we have to send a full revision
+            # of b.
+            if nb in punchrevs or self.punched(b):
+                d = struct.pack(">l", -1)
+            elif na in punchrevs or self.punched(a):
+                d = self.diff("", self.revision(nb))
+            else:
+                d = self.revdiff(a, b)
+
             p = self.parents(nb)
             meta = nb + p[0] + p[1] + lookup(nb)
             yield changegroup.genchunk("%s%s" % (meta, d))
@@ -1134,6 +1163,11 @@ class revlog(object):
                 chain = node
                 continue
             delta = chunk[80:]
+            punched = False
+            if len(delta) == 4:
+                i = struct.unpack(">l", delta)[0]
+                if i == -1:
+                    punched = True
 
             for p in (p1, p2):
                 if not p in self.nodemap:
@@ -1151,33 +1185,55 @@ class revlog(object):
             # the size of the previous full rev as a proxy for the
             # current size.
 
-            if chain == prev:
+            if punched:
+                textlen = -1
+                cdeltalen = -1
+            elif chain and self.punched(self.rev(chain)):
+                # if the chain rev was punched, make sure we add a full revision
+                prev = None
+            elif chain == prev:
                 tempd = compress(delta)
                 cdelta = tempd[0] + tempd[1]
+                cdeltalen = len(cdelta)
                 textlen = mdiff.patchedsize(textlen, delta)
 
-            if chain != prev or (end - start + len(cdelta)) > textlen * 2:
+            # all the logic for adding punched revs goes into the
+            # delta code later down.  Don't send punched revs into
+            # addrevision
+            if not punched and (chain != prev or 
+                (end - start + len(cdelta)) > textlen * 2):
                 # flush our writes here so we can read it in revision
                 if dfh:
                     dfh.flush()
                 ifh.flush()
-                text = self.revision(chain)
-                text = self.patches(text, [delta])
+                if self.punched(self.rev(chain)):
+                    text = self.patches("", [delta])
+                else:
+                    text = self.revision(chain)
+                    text = self.patches(text, [delta])
                 chk = self.addrevision(text, transaction, link, p1, p2)
                 if chk != node:
                     raise RevlogError(_("consistency error adding group"))
                 textlen = len(text)
             else:
+                if base == -1:
+                    # this only happens when punched
+                    base = self.count()
                 if self.version == REVLOGV0:
                     e = (end, len(cdelta), base, link, p1, p2, node)
                 else:
-                    e = (self.offset_type(end, 0), len(cdelta), textlen, base,
+                    e = (self.offset_type(end, 0), cdeltalen, textlen, base,
                          link, self.rev(p1), self.rev(p2), node)
                 self.index.append(e)
                 self.nodemap[node] = r
+                entry = struct.pack(self.indexformat, *e)
+                if len(self.index) == 1 and self.version != REVLOGV0:
+                    l = struct.pack(versionformat, self.version)
+                    entry = l + entry[4:]
                 if self.inlinedata():
-                    ifh.write(struct.pack(self.indexformat, *e))
-                    ifh.write(cdelta)
+                    ifh.write(entry)
+                    if not punched:
+                        ifh.write(cdelta)
                     self.checkinlinesize(transaction, ifh)
                     if not self.inlinedata():
                         dfh = self.opener(self.datafile, "a")
@@ -1188,8 +1244,9 @@ class revlog(object):
                         # reopen the index
                         dfh = self.opener(self.datafile, "a")
                         ifh = self.opener(self.indexfile, "a")
-                    dfh.write(cdelta)
-                    ifh.write(struct.pack(self.indexformat, *e))
+                    if not punched:
+                        dfh.write(cdelta)
+                    ifh.write(entry)
 
             t, r, chain, prev = r, r + 1, node, node
             base = self.base(t)


More information about the Mercurial mailing list