#!/usr/bin/env python
"""
This is the indexer script for the Pyblosxom 'lupy_search' plugin.

Add this into the crontab. Entry should look like:

    0 0 * * * /path/to/lupy_index.py -q

Which means 'Run daily at midnight'

For help/usage type:

    /path/to/lupy_index.py -h

Uses the following variables from config.py:

    py['datadir']: the directory whose files are indexed
    py['lupy_index']: the absolute path to the index directory
    py['lupy_file_ext']: a list of file extensions to be included in the index
"""

# Python imports
import os
import re
import sys
import time
import fnmatch

# Lupy 0.2.1 imports http://divmod.org/Home/Projects/Lupy/
from lupy.index.indexwriter import IndexWriter
from lupy import document

# Permissions (rwxr-xr-x) for index directories created by main().
# Spelled with int() so the value does not depend on octal-literal syntax.
_INDEX_DIR_MODE = int("755", 8)


class GlobDirectoryWalker:
    """A forward iterator that traverses a directory tree.

    Iterating over an instance yields the full path of every entry below
    B{directory} whose name matches the fnmatch-style B{pattern}.

    Iteration relies on the legacy sequence protocol: C{__getitem__} is
    called repeatedly and the loop ends when it raises IndexError.
    """

    def __init__(self, directory, pattern="*"):
        self.stack = [directory]   # directories still to be listed
        self.pattern = pattern
        self.files = []            # entries of the directory being walked
        self.index = 0             # position of the next entry in self.files

    def __getitem__(self, index):
        """Return the next matching path; B{index} itself is ignored."""
        while 1:
            try:
                file = self.files[self.index]
                self.index = self.index + 1
            except IndexError:
                # Current directory exhausted: pop the next one from the
                # stack. When the stack is empty, pop() raises IndexError,
                # which propagates and terminates the iteration.
                self.directory = self.stack.pop()
                self.files = os.listdir(self.directory)
                self.index = 0
            else:
                # got a filename
                fullname = os.path.join(self.directory, file)
                # queue real sub-directories (symlinks are not followed)
                if os.path.isdir(fullname) and not os.path.islink(fullname):
                    self.stack.append(fullname)
                if fnmatch.fnmatch(file, self.pattern):
                    return fullname


def _entry_name(root, path):
    """Return B{path} relative to B{root}, with its file extension removed.

    Only a leading B{root} is stripped; later occurrences of the same
    string inside the path are left alone.
    """
    if path.startswith(root):
        path = path[len(root):]
    return os.path.splitext(path.lstrip(os.sep))[0]


class Indexer(object):
    """Builds a Lupy index from the files below a directory."""

    __slots__ = ['indexer', '_file_ext', '_exclude']

    def __init__(self, path, file_ext=None, exclude=None, create=False):
        """Create an indexer, writing an index to the directory B{path}.

        B{file_ext} is a list of file extensions (without the leading dot)
        to index. B{exclude} is a list of paths, relative to the indexed
        directory and without file extension, that are skipped. Both
        default to an empty list.

        The boolean flag B{create} determines whether the index is created
        (overwriting an existing index) or updated.
        """
        # None defaults avoid sharing one mutable list between instances.
        if file_ext is None:
            file_ext = []
        if exclude is None:
            exclude = []
        self._file_ext = file_ext
        self._exclude = exclude
        self.indexer = IndexWriter(path, create)

    def addDoc(self, fname):
        """Add the file B{fname} to the index.

        Relies on the module-level name C{Stripper}, which is bound by the
        script entry point once the Pyblosxom code base is importable.
        """
        # create document
        d = document.Document()

        # add a file field containing the path to this file
        d.add(document.Keyword('file', fname))

        # The whole file is read up front, so the handle can be released
        # immediately, even when reading fails.
        fp = open(fname, 'rb')
        try:
            s = fp.read()
        finally:
            fp.close()

        # The first line of an entry is stored in the title field.
        title = s.split('\n', 1)[0]
        if s.startswith("<"):
            # content that starts with markup gets no title
            title = ""

        if _verbose:
            if title:
                print("[%s]" % title)
            else:
                print("")

        d.add(document.Text('title', title))

        # strip html
        parser = Stripper()
        parser.feed(s)
        s = parser.gettext()

        # Here False is passed as the 3rd arg to ensure that the actual
        # text of s is not stored in the index.
        d.add(document.Text('text', s, False))

        # add doc to index
        self.indexer.addDocument(d)

    def index(self, dir):
        """Recurse through B{dir} and index the files.

        Files are selected by the extensions given to the constructor;
        entries named in the exclude list are skipped. The index is
        optimized and closed afterwards.
        """
        for ext in self._file_ext:
            for file in GlobDirectoryWalker(dir, "*." + ext):
                if _entry_name(dir, file) in self._exclude:
                    continue
                if _verbose:
                    # no newline: addDoc() finishes the line with the title
                    sys.stdout.write("[ %s ] " % file)
                self.addDoc(file)

        # Indexing usually leaves dozens of files from several segments in
        # the index directory. optimize() merges all the segments into one.
        # It can be quite an expensive operation, but it can save space and
        # speed up searches.
        self.indexer.optimize()
        self.indexer.close()


def main(cfg):
    """Build a fresh index from the Pyblosxom configuration dict B{cfg}.

    Reads cfg['datadir'], cfg['lupy_index'] and cfg['lupy_file_ext'].
    Progress messages go to stdout unless quiet mode is active.
    """
    datadir = os.path.abspath(cfg['datadir'])
    indexpath = os.path.abspath(cfg["lupy_index"])
    file_ext = cfg['lupy_file_ext']

    # Create the index directory together with any missing parents.
    if not os.path.isdir(indexpath):
        os.makedirs(indexpath, _INDEX_DIR_MODE)

    if not _quiet:
        started = time.time()
        print("Creating index for '%s' in '%s':" % (datadir, indexpath))

    # create a new index in a directory
    indexer = Indexer(indexpath, file_ext, [], True)

    # recursively index the files in a directory
    indexer.index(datadir)

    if not _quiet:
        print("Elapsed time: %s" % (time.time() - started))
        print("")


# Output switches; overwritten from the command line by the script below.
_verbose = False
_quiet = False

if __name__ == "__main__":
    import optparse

    parser = optparse.OptionParser()
    parser.add_option('-c', '--config', type='string', dest='config',
                      default=None, nargs=1,
                      help='path to your pyblosxom configuration file',
                      metavar='')
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose',
                      default=False,
                      help='verbosely list files processed',
                      metavar='')
    parser.add_option('-q', '--quiet', action='store_true', dest='quiet',
                      default=False,
                      help='don\'t write to stdout (overrides -v)',
                      metavar='')
    options, args = parser.parse_args()

    _verbose = options.verbose
    _quiet = options.quiet
    if _quiet:
        _verbose = False

    # add the directory containing config.py to the path
    conf_dir = "."
    if options.config and os.sep in options.config:
        # it's a path: either to config.py itself or to its directory
        if 'config.py' in options.config:
            conf_dir = options.config.replace("config.py", "")
        else:
            conf_dir = options.config
    conf_dir = os.path.abspath(conf_dir)
    sys.path.append(conf_dir)

    # try to load the config.py
    try:
        from config import py as cfg
    except ImportError:
        sys.stderr.write("Configuration file could not be loaded.\n\n")
        sys.stderr.write(parser.format_help() + "\n")
        sys.exit(2)

    # check config for required properties ('datadir' is read by main())
    config_required = ['datadir', 'lupy_index', 'lupy_file_ext']
    missing = [conf for conf in config_required if conf not in cfg]
    if missing:
        for conf in missing:
            sys.stderr.write(
                "Missing required config property '%s'.\n" % conf)
        sys.exit(2)

    # add pyblosxom install dir to the path
    if 'codebase' in cfg:
        sys.path.append(cfg['codebase'])

    # Pyblosxom imports; deferred until the code base is on sys.path.
    # Indexer.addDoc() uses this module-level name.
    from Pyblosxom.tools import Stripper

    if not _quiet:
        print("Using configuration from: %s/config.py\n" % conf_dir)

    main(cfg)