[PATCH 5 of 5 v5] store: implement fncache hash encoding in C

Mon Sep 10 15:35:02 CDT 2012

# HG changeset patch
# User Bryan O'Sullivan <bryano at fb.com>
# Date 1347309263 25200
# Node ID bd3831e77556a584bd581cd8ff7140e3404f6b68
# Parent  c2cc9263d4969b22b4b9fd39f22e97c70fc700a2
store: implement fncache hash encoding in C

The Python implementation of hash encoding is slow enough to affect
the overall performance of Mercurial even when only a small fraction of
paths must be hash encoded.

In a repo containing 160,000 files, where just 1% of files need to be
hash encoded, this patch improves overall encoding performance by 50%
compared to the hybrid C-and-Python encoder, and improves the speedup
compared to pure Python from 14x to 22x.

diff --git a/mercurial/pathencode.c b/mercurial/pathencode.c
--- a/mercurial/pathencode.c
+++ b/mercurial/pathencode.c
@@ -47,6 +47,14 @@
 	DEFAULT, /* byte of a path component after the first */
 };
 
+/* state machine for dir-encoding */
+enum dir_state {
+	DDOT,
+	DH,
+	DHGDI,
+	DDEFAULT,
+};
+
 static inline int isset(const uint32_t bitset[], char c)
 {
 	return bitset[((uint8_t)c) >> 5] & (1 << (((uint8_t)c) & 31));
@@ -397,8 +405,251 @@
 		       src + 5, len - 5, 1);
 }
 
+static Py_ssize_t direncode(char *dest, size_t destsize,
+			    const char *src, Py_ssize_t len)
+{
+	enum dir_state state = DDEFAULT;
+	Py_ssize_t i = 5, destlen = 0;
+
+	memcopy(dest, &destlen, destsize, "data/", 5);
+
+	while (i < len) {
+		switch (state) {
+		case DDOT:
+			switch (src[i]) {
+			case 'd':
+			case 'i':
+				state = DHGDI;
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			case 'h':
+				state = DH;
+				charcopy(dest, &destlen, destsize, src[i++]);
+				break;
+			default:
+				state = DDEFAULT;
+				break;
+			}
+			break;
+		case DH:
+			if (src[i] == 'g') {
+				state = DHGDI;
+				charcopy(dest, &destlen, destsize, src[i++]);
+			}
+			else state = DDEFAULT;
+			break;
+		case DHGDI:
+			if (src[i] == '/') {
+				memcopy(dest, &destlen, destsize, ".hg", 3);
+				charcopy(dest, &destlen, destsize, src[i++]);
+			}
+			state = DDEFAULT;
+			break;
+		case DDEFAULT:
+			if (src[i] == '.')
+				state = DDOT;
+			charcopy(dest, &destlen, destsize, src[i++]);
+			break;
+		}
+	}
+
+	return destlen;
+}
+
+static Py_ssize_t auxencode(char *dest, size_t destsize,
+			    const char *src, Py_ssize_t len)
+{
+	static const uint32_t twobytes[8];
+
+	static const uint32_t onebyte[8] = {
+		~0, 0xffffbffe, ~0, ~0, ~0, ~0, ~0, ~0,
+	};
+
+	return _encode(twobytes, onebyte, dest, 0, destsize, src, len, 0);
+}
+
 static const Py_ssize_t maxstorepathlen = 120;
 
+static Py_ssize_t lowerencode(char *dest, size_t destsize,
+			      const char *src, Py_ssize_t len)
+{
+	static const uint32_t onebyte[8] = {
+		1, 0x2bfffbfb, 0xe8000001, 0x2fffffff
+	};
+
+	static const uint32_t lower[8] = { 0, 0, 0x7fffffe };
+
+	Py_ssize_t i = 0, destlen = 0;
+
+	while (i < len) {
+		if (isset(onebyte, src[i]))
+			charcopy(dest, &destlen, destsize, src[i++]);
+		else if (isset(lower, src[i]))
+			charcopy(dest, &destlen, destsize, src[i++] + 32);
+		else
+			escape3(dest, &destlen, destsize, src[i++]);
+	}
+
+	return destlen;
+}
+
+static PyObject *hashmangle(const char *src, Py_ssize_t len, const char sha[20])
+{
+	static const Py_ssize_t dirprefixlen = 8;
+	static const Py_ssize_t maxshortdirslen = 68;
+	char *dest;
+	PyObject *ret;
+
+	Py_ssize_t i, d, p, lastslash = len - 1, lastdot = -1;
+	Py_ssize_t destsize, destlen = 0, slop, used;
+
+	while (lastslash >= 0 && src[lastslash] != '/') {
+		if (src[lastslash] == '.' && lastdot == -1)
+			lastdot = lastslash;
+		lastslash--;
+	}
+
+	destsize = 170;
+	if (lastdot >= 0)
+		destsize += len - lastdot - 1;
+
+	ret = PyString_FromStringAndSize(NULL, destsize);
+	if (ret == NULL)
+		return NULL;
+
+	dest = PyString_AS_STRING(ret);
+	memcopy(dest, &destlen, destsize, "dh/", 3);
+
+	for (i = d = p = 0; i < lastslash; i++, p++) {
+		if (src[i] == '/') {
+			char d = destlen > 0 ? dest[destlen - 1] : 0;
+			/* After truncation, a directory name may end
+			   in a space or dot. */
+			if (d == '.' || d == ' ') {
+				destlen--;
+				charcopy(dest, &destlen, destsize, '_');
+			}
+			if (destlen > maxshortdirslen)
+				break;
+			charcopy(dest, &destlen, destsize, src[i]);
+			p = -1;
+		}
+		else if (p < dirprefixlen)
+			charcopy(dest, &destlen, destsize, src[i]);
+	}
+
+	if (destlen > maxshortdirslen + 3)
+		do {
+			destlen--;
+		} while (destlen > 0 && dest[destlen] != '/');
+
+	if (destlen > 3)
+		charcopy(dest, &destlen, destsize, '/');
+
+	used = destlen + 40;
+	if (lastdot >= 0)
+		used += len - lastdot - 1;
+	slop = maxstorepathlen - used;
+	if (slop > 0) {
+		Py_ssize_t basenamelen =
+			lastslash >= 0 ? len - lastslash - 2 : len - 1;
+
+		if (basenamelen > slop)
+			basenamelen = slop;
+		if (basenamelen > 0)
+			memcopy(dest, &destlen, destsize, &src[lastslash + 1],
+				basenamelen);
+	}
+
+	for (i = 0; i < 20; i++)
+		hexencode(dest, &destlen, destsize, sha[i]);
+
+	if (lastdot >= 0)
+		memcopy(dest, &destlen, destsize, &src[lastdot],
+			len - lastdot - 1);
+
+	PyString_GET_SIZE(ret) = destlen;
+
+	return ret;
+}
+
+/*
+ * Avoiding a trip through Python would improve performance by 50%,
+ * but we don't encounter enough long names to be worth the code.
+ */
+static int sha1hash(char hash[20], const char *str, Py_ssize_t len)
+{
+	static PyObject *shafunc;
+	PyObject *shaobj, *hashobj;
+
+	if (shafunc == NULL) {
+		PyObject *util, *name = PyString_FromString("mercurial.util");
+
+		if (name == NULL)
+			return -1;
+
+		util = PyImport_Import(name);
+		Py_DECREF(name);
+
+		if (util == NULL) {
+			PyErr_SetString(PyExc_ImportError, "mercurial.util");
+			return -1;
+		}
+		shafunc = PyObject_GetAttrString(util, "sha1");
+		Py_DECREF(util);
+
+		if (shafunc == NULL) {
+			PyErr_SetString(PyExc_AttributeError,
+					"module 'mercurial.util' has no "
+					"attribute 'sha1'");
+			return -1;
+		}
+	}
+
+	shaobj = PyObject_CallFunction(shafunc, "s#", str, len);
+
+	if (shaobj == NULL)
+		return -1;
+
+	hashobj = PyObject_CallMethod(shaobj, "digest", "");
+	Py_DECREF(shaobj);
+
+	if (!PyString_Check(hashobj) || PyString_GET_SIZE(hashobj) != 20) {
+		PyErr_SetString(PyExc_TypeError,
+				"result of digest is not a 20-byte hash");
+		Py_DECREF(hashobj);
+		return -1;
+	}
+
+	memcpy(hash, PyString_AS_STRING(hashobj), 20);
+	Py_DECREF(hashobj);
+	return 0;
+}
+
+static PyObject *hashencode(const char *src, Py_ssize_t len)
+{
+	const Py_ssize_t baselen = (len - 5) * 3;
+#ifndef _MSC_VER
+	/* alloca is surprisingly slow, so avoid with sane compilers */
+	char dired[baselen];
+	char lowered[baselen];
+	char auxed[baselen];
+#else
+	char *dired = alloca(baselen);
+	char *lowered = alloca(baselen);
+	char *auxed = alloca(baselen);
+#endif
+	Py_ssize_t dirlen, lowerlen, auxlen;
+	char sha[20];
+
+	dirlen = direncode(dired, baselen, src, len);
+	if (sha1hash(sha, dired, dirlen - 1) == -1)
+		return NULL;
+	lowerlen = lowerencode(lowered, baselen, dired + 5, dirlen - 5);
+	auxlen = auxencode(auxed, baselen, lowered, lowerlen);
+	return hashmangle(auxed, auxlen, sha);
+}
+
 PyObject *pathencode(PyObject *self, PyObject *args)
 {
 	Py_ssize_t len, newlen;
@@ -428,10 +679,9 @@
 			basicencode(PyString_AS_STRING(newobj), newlen, path,
 				    len + 1);
 		}
-	} else {
-		newobj = Py_None;
-		Py_INCREF(newobj);
 	}
+	else
+		newobj = hashencode(path, len + 1);
 
 	return newobj;
 }
diff --git a/mercurial/store.py b/mercurial/store.py
--- a/mercurial/store.py
+++ b/mercurial/store.py
@@ -433,15 +433,9 @@
     if 'store' in requirements:
         if 'fncache' in requirements:
             dotreq = 'dotencode' in requirements
-            auxencode = lambda f: _auxencode(f, dotreq)
-            pathencode = dotreq and getattr(parsers, 'pathencode', None)
-            if pathencode:
-                def encode(f):
-                    pe = pathencode(f)
-                    if pe is None:
-                        return _hashencode(f, auxencode)
-                    return pe
-            else:
+            encode = dotreq and getattr(parsers, 'pathencode', None)
+            if not encode:
+                auxencode = lambda f: _auxencode(f, dotreq)
                 encode = lambda f: _hybridencode(f, auxencode)
             return fncachestore(path, openertype, encode)
         return encodedstore(path, openertype)