ffind @ 8de084a9b64d

Implement file size filtering.
author Steve Losh <steve@stevelosh.com>
date Sat, 22 Sep 2012 17:14:49 -0400
parents db67c021a0d9
children 413a89f3b8ce
#!/usr/bin/env python
# -*- coding: utf8 -*-

#    ____  ____  __  ____  __ _  ____  __    _  _     ____  __  __ _  ____
#   (  __)(  _ \(  )(  __)(  ( \(    \(  )  ( \/ )___(  __)(  )(  ( \(    \
#    ) _)  )   / )(  ) _) /    / ) D (/ (_/\ )  /(___)) _)  )( /    / ) D (
#   (__)  (__\_)(__)(____)\_)__)(____/\____/(__/     (__)  (__)\_)__)(____/
#
#                         The friendlier file finder.

import os
import optparse
import string
import sys
import re
from optparse import OptionParser, OptionGroup


# Constants -------------------------------------------------------------------
CASE_SENSITIVE = 1
CASE_INSENSITIVE = 2
CASE_SMART = 3

BYTE = 1
KILOBYTE = 1024 * BYTE
MEGABYTE = 1024 * KILOBYTE
GIGABYTE = 1024 * MEGABYTE
TERABYTE = 1024 * GIGABYTE
PETABYTE = 1024 * TERABYTE

VCS_DIRS = ['.hg', '.git', '.svn']

TYPE_FILE_REAL = 1
TYPE_FILE_SYMLINK = 2
TYPE_DIR_REAL = 3
TYPE_DIR_SYMLINK = 4

TYPES_FILE_REAL = set([TYPE_FILE_REAL])
TYPES_FILE_SYMLINK = set([TYPE_FILE_SYMLINK])
TYPES_DIR_REAL = set([TYPE_DIR_REAL])
TYPES_DIR_SYMLINK = set([TYPE_DIR_SYMLINK])

TYPES_FILE = TYPES_FILE_REAL | TYPES_FILE_SYMLINK
TYPES_DIR = TYPES_DIR_REAL | TYPES_DIR_SYMLINK

TYPES_REAL = TYPES_FILE_REAL | TYPES_DIR_REAL
TYPES_SYMLINK = TYPES_FILE_SYMLINK | TYPES_DIR_SYMLINK

TYPES_ALL = TYPES_FILE | TYPES_DIR


# Regexes ---------------------------------------------------------------------
SIZE_RE = re.compile(r'^(\d+(?:\.\d+)?)([bkmgtp])?[a-z]*$', re.IGNORECASE)


# Global Options --------------------------------------------------------------
# (it's a prototype, shut up)
options = None


# Output ----------------------------------------------------------------------
def out(s, line_ending='\n'):
    sys.stdout.write(s + line_ending)

def err(s):
    sys.stderr.write(s + '\n')

def die(s, exitcode=1):
    err('error: ' + s)
    sys.exit(exitcode)


# Searching! ------------------------------------------------------------------
def get_type(path):
    link = os.path.islink(path)
    dir = os.path.isdir(path)

    if link and dir:
        return TYPE_DIR_SYMLINK
    elif link and not dir:
        return TYPE_FILE_SYMLINK
    elif not link and dir:
        return TYPE_DIR_REAL
    elif not link and not dir:
        return TYPE_FILE_REAL

def should_ignore(basename, path):
    if basename in VCS_DIRS:
        return True

    return False

def match(query, path, basename):
    def _match():
        if options.type != TYPES_ALL:
            if get_type(path) not in options.type:
                return False

        if options.case == CASE_INSENSITIVE:
            if query.lower() not in basename.lower():
                return False
        else:
            if query not in basename:
                return False

        if options.larger_than:
            stat = os.stat(path)
            if stat.st_size < options.larger_than:
                return False

        if options.smaller_than:
            stat = os.stat(path)
            if stat.st_size > options.smaller_than:
                return False

        if not options.binary:
            with open(path) as f:
                if '\0' in f.read(1024):
                    return False

        return True


    result = _match()
    return not result if options.invert else result

def search(query, dir='.', depth=0):
    contents = os.listdir(dir)

    next = []
    for item in contents:
        path = os.path.join(dir, item)
        if not should_ignore(item, path):
            if match(query, path, item):
                out(path, '\0' if options.zero else '\n')

            is_dir = os.path.isdir(path)
            if is_dir:
                if options.follow or not os.path.islink(path):
                    next.append(path)


    if depth < options.depth:
        for d in next:
            search(query, d, depth + 1)


# Option Parsing and Main -----------------------------------------------------
def build_option_parser():
    p = OptionParser("usage: %prog [options] PATTERN")

    # Main options
    p.add_option('-d', '--dir', default='.',
                 help='root the search in DIR (default .)',
                 metavar='DIR')
    p.add_option('-D', '--depth', default='25',
                 help='search at most N directories deep (default 25)',
                 metavar='N')
    p.add_option('-f', '--follow',
                 action='store_true', default=False,
                 help='follow symlinked directories and search their contents')
    p.add_option('-F', '--no-follow',
                 dest='follow', action='store_false',
                 help="don't follow symlinked directories (default)")
    p.add_option('-0', '--print0', dest='zero',
                 action='store_true', default=False,
                 help='separate matches with a null byte in output')
    p.add_option('-l', '--literal',
                 action='store_true', default=False,
                 help='force literal search, even if it looks like a regex')
    p.add_option('-v', '--invert',
                 action='store_true', default=False,
                 help='invert match')

    # Case sensitivity
    g = OptionGroup(p, "Configuring Case Sensitivity")
    g.add_option('-s', '--case-sensitive',
                 dest='case', action='store_const', const=CASE_SENSITIVE,
                 default=CASE_SENSITIVE,
                 help='case sensitive matching (default)')
    g.add_option('-i', '--case-insensitive',
                 dest='case', action='store_const', const=CASE_INSENSITIVE,
                 help='case insensitive matching')
    g.add_option('-S', '--case-smart',
                 dest='case', action='store_const', const=CASE_SMART,
                 help='smart case matching (sensitive if any uppercase chars '
                      'are in the pattern, insensitive otherwise)')
    p.add_option_group(g)

    # Ignoring
    g = OptionGroup(p, "Configuring Ignoring")
    g.add_option('-b', '--binary',
                 dest='binary', action='store_true', default=True,
                 help="allow binary files (default)")
    g.add_option('-B', '--no-binary',
                 dest='binary', action='store_false',
                 help='ignore binary files')
    g.add_option('-r', '--restricted',
                 action='store_true', default=False,
                 help="restricted search (skip VCS directories, "
                      "parse all ignore files) (default)")
    g.add_option('-q', '--semi-restricted',
                 action='store_true', default=False,
                 help="semi-restricted search (don't parse VCS ignore files, "
                      "but still skip VCS directories and parse .ffignore)")
    g.add_option('-u', '--unrestricted',
                 action='store_true', default=False,
                 help="unrestricted search (don't parse ignore files, but "
                      "still skip VCS directories)")
    g.add_option('-a', '--all',
                 action='store_true', default=False,
                 help="don't ignore anything (ALL files can match)")
    g.add_option('-I', '--ignore', metavar='PATTERN',
                 action='append',
                 help="add a pattern to be ignored (can be given multiple times)")
    p.add_option_group(g)

    # Time filtering
    g = OptionGroup(p, "Time Filtering")
    g.add_option('--before',
                 help='match files modified < TIME',
                 metavar='TIME')
    g.add_option('--after',
                 help='match files modified > TIME',
                 metavar='TIME')
    g.add_option('--until',
                 help='match files modified <= TIME',
                 metavar='TIME')
    g.add_option('--since',
                 help='match files modified >= TIME',
                 metavar='TIME')
    g.add_option('--at',
                 help='match files modified at TIME',
                 metavar='TIME')
    g.add_option('--created-before',
                 help='match files created < TIME',
                 metavar='TIME')
    g.add_option('--created-after',
                 help='match files created > TIME',
                 metavar='TIME')
    g.add_option('--created-until',
                 help='match files created <= TIME',
                 metavar='TIME')
    g.add_option('--created-since',
                 help='match files created >= TIME',
                 metavar='TIME')
    g.add_option('--created-at',
                 help='match files created at TIME',
                 metavar='TIME')
    p.add_option_group(g)

    # Size filtering
    g = OptionGroup(p, "Size Filtering",
                    "Sizes can be given as a number followed by a prefix.  Some examples: "
                    "1k, 5kb, 1.5gb, 2g, 1024b")
    g.add_option('--larger-than',
                 help='match files larger than SIZE (inclusive)',
                 metavar='SIZE')
    g.add_option('--bigger-than', dest='larger_than',
                 help=optparse.SUPPRESS_HELP)
    g.add_option('--smaller-than',
                 help='match files smaller than SIZE (inclusive)',
                 metavar='SIZE')
    p.add_option_group(g)

    # Type filtering
    g = OptionGroup(p, "Type Filtering",
                    "Possible types are "
                    "a (all), "
                    "f (files), "
                    "d (dirs), "
                    "r (real), "
                    "s (symlinked), "
                    "e (real files), "
                    "c (real dirs), "
                    "x (symlinked files), "
                    "y (symlinked dirs). "
                    "If multiple types are given they will be unioned together:  "
                    "--type 'es' would match real files and all symlinks.")
    g.add_option('-t', '--type',
                 action='store', default=False, metavar='TYPE(S)',
                 help='match only specific types of things (files, dirs, non-symlinks, symlinks)')
    p.add_option_group(g)

    return p


def build_type_set(types):
    if not types:
        return TYPES_ALL

    result = set()
    for c in types:
        result = result | {
            'a': TYPES_ALL,

            'e': TYPES_FILE_REAL,
            'x': TYPES_FILE_SYMLINK,
            'c': TYPES_DIR_REAL,
            'y': TYPES_DIR_SYMLINK,

            'f': TYPES_FILE,
            'd': TYPES_DIR,

            'r': TYPES_REAL,
            's': TYPES_SYMLINK,
        }[c.lower()]

    return result


def parse_size(size):
    size = size.replace(' ', '') if size else size

    if not size:
        return None

    m = SIZE_RE.match(size)
    if not m:
        die('invalid size "%s"' % size)

    n, unit = m.groups()

    try:
        n = float(n)
    except ValueError:
        die('invalid size "%s"' % size)

    unit = {
        'b': BYTE,
        'k': KILOBYTE,
        'm': MEGABYTE,
        'g': GIGABYTE,
        't': TERABYTE,
        'p': PETABYTE,
    }[unit or 'b']

    return int(n * unit)


def main():
    global options

    (options, args) = build_option_parser().parse_args()

    # PATTERN
    if len(args) > 1:
        die("only one search pattern can be given")
        sys.exit(1)

    query = args[0] if args else ''

    # --dir
    if options.dir:
        try:
            os.chdir(options.dir)
        except OSError:
            die('could not change to directory "%s"' % options.dir)

    # --depth
    try:
        options.depth = int(options.depth)
    except ValueError:
        die('depth must be a non-negative integer (got "%s")' % options.depth)

    # --case-*
    if options.case == CASE_SMART:
        if any(c in string.uppercase for c in query):
            options.case = CASE_SENSITIVE
        else:
            options.case = CASE_INSENSITIVE

    # --type
    options.type = build_type_set(options.type)

    # --larger-than, --smaller-than
    options.larger_than = parse_size(options.larger_than)
    options.smaller_than = parse_size(options.smaller_than)

    # Go!
    search(query)


if __name__ == '__main__':
    main()