9626235f7fe1

Second try at git's hilariously-documented ignore format.
[view raw] [browse files]
author Steve Losh <steve@stevelosh.com>
date Thu, 27 Sep 2012 13:22:41 -0400 (2012-09-27)
parents cf6c3851d091
children c4bc191a0127
branches/tags (none)
files ffind

Changes

--- a/ffind	Thu Sep 27 00:00:33 2012 -0400
+++ b/ffind	Thu Sep 27 13:22:41 2012 -0400
@@ -132,64 +132,105 @@
     return lambda s: l in s
 
 def compile_git(line):
+    original_line = line
     pat = ''
 
-    # The following comments are (mostly) from gitignore(5).
-
-    # If the pattern ends with a slash, it is removed for the purpose of the
-    # following description, but it would only find a match with a directory. In
-    # other words, foo/ will match a directory foo and paths underneath it, but
-    # will not match a regular file or a symbolic link foo (this is consistent
-    # with the way how pathspec works in general in git).
-    # directories_only = line.endswith('/')
+    # From man gitignore 5:
+    #     If the pattern ends with a slash, it is removed for the purpose of the
+    #     following description, but it would only find a match with
+    #     a directory. In other words, foo/ will match a directory foo and paths
+    #     underneath it, but will not match a regular file or a symbolic link
+    #     foo (this is consistent with the way how pathspec works in general in
+    #     git).
+    #
+    #     A leading slash matches the beginning of the pathname. For example,
+    #     "/*.c" matches "cat-file.c" but not "mozilla-sha1/sha1.c".
+    #
+    #     If the pattern does not contain a slash /, git treats it as a shell
+    #     glob pattern and checks for a match against the pathname relative to
+    #     the location of the .gitignore file (relative to the toplevel of the
+    #     work tree if not from a .gitignore file).
+    #
+    #     Otherwise, git treats the pattern as a shell glob suitable for
+    #     consumption by fnmatch(3) with the FNM_PATHNAME flag: wildcards in the
+    #     pattern will not match a / in the pathname. For example,
+    #     "Documentation/*.html" matches "Documentation/git.html" but not
+    #     "Documentation/ppc/ppc.html" or "tools/perf/Documentation/perf.html".
+    #
+    # If you can't tell what the hell this means you're not alone, because git's
+    # documentation is fucking inscrutable.  Here's what I've come up with from
+    # trial and error:
+    # 
+    # 0. Patterns ending in a slash will only match directories, and then you
+    #    can ignore that slash for the rest of these rules.
+    # 1. Patterns are shell globs, except * doesn't match / and there's no **.
+    # 2. Patterns without a slash search the basename of the path, for example:
+    #    the 'file.txt' in '/foo/bar/file.txt'.
+    # 3. Patterns with a slash search against the entire path.
+    # 4. All matching must match the entire string it's searching.  For example:
+    #
+    #    'am' will not ignore '/foo/bar/spam'
+    #    it matches against the basename 'spam' but does not match all of it
+    #
+    #    'bar/spam' will not ignore '/foo/bar/spam'
+    #    it matches against the full path (because it has a slash) but does not
+    #    match all of it.
+    # 5. A leading slash doesn't affect the matching, but does turn a
+    #    "pattern with no slash" into a "pattern with a slash".  So:
+    #
+    #    'bar' will ignore '/foo/bar/spam' (actually it'll ignore bar entirely)
+    #    it matches against the basename 'bar' (because there's no slash) when
+    #    at that level
+    #
+    #    '/bar' will not ignore '/foo/bar/spam'
+    #    it matches against the entire path '/foo/bar' (because there is
+    #    a slash) when at that level
 
-    # A leading slash matches the beginning of the pathname. For example, "/*.c"
-    # matches "cat-file.c" but not "mozilla-sha1/sha1.c".
-    if line.startswith('/'):
+    if line.endswith('/'):
+        # TODO: Honor the trailing slash by only matching directories.
+        # directories_only = True
+        line = line[:-1]
+
+    has_slash = '/' in line
+
+    line = line.lstrip('/')
+
+    if has_slash:
+        # Patterns with a slash have to match against the entire pathname.  So
+        # they need to be rooted at the beginning.
         pat += '^./'
-        line = line[1:]
+    else:
+        # Patterns without a slash match against just the basename, which we'll
+        # simulate by including the (final) divider in the pattern.
+        pat += '/'
 
-    def _eat_glob(chs):
-        pat = ''
-        while chs:
+    # The rest of the pattern is git's variation on shell globs.
+    # Mostly normal shell globs, but there's no **.
+    chs = list(line)
+    while chs:
+        ch = chs.pop(0)
+        if ch == '?':
+            pat += '.'
+        elif ch == '*':
+            pat += '[^/]*'
+        elif ch == '[':
+            pat += '['
             ch = chs.pop(0)
-            if ch == '?':
-                pat += '.'
-            elif ch == '*':
-                pat += '[^/]*'
-            elif ch == '[':
-                pat += '['
+            while chs and ch != ']':
+                pat += ch
                 ch = chs.pop(0)
-                while chs and ch != ']':
-                    pat += ch
-                    ch = chs.pop(0)
-                pat += ']'
-            else:
-                pat += re.escape(ch)
-        return pat
+            pat += ']'
+        else:
+            pat += re.escape(ch)
 
-    chs = list(line)
-    # I can't tell what the difference is between these two cases because git's
-    # documentation is fucking inscrutable.
-    if '/' not in line:
-        # If the pattern does not contain a slash /, git treats it as a shell
-        # glob pattern and checks for a match against the pathname relative to
-        # the location of the .gitignore file (relative to the toplevel of the
-        # work tree if not from a .gitignore file).
-        pat += _eat_glob(chs)
-    else:
-        # Otherwise, git treats the pattern as a shell glob suitable for
-        # consumption by fnmatch(3) with the FNM_PATHNAME flag: wildcards in the
-        # pattern will not match a / in the pathname. For example,
-        # "Documentation/*.html" matches "Documentation/git.html" but not
-        # "Documentation/ppc/ppc.html" or "tools/perf/Documentation/perf.html".
-        pat += _eat_glob(chs)
+    # Patterns always have to be anchored at the end.
+    pat += '$'
 
     try:
         regex = re.compile(pat)
         return lambda s: regex.search(s)
     except:
-        warn("could not parse gitignore pattern '%s'" % line)
+        warn("could not parse gitignore pattern '%s'" % original_line)
         return lambda s: True