#!/usr/bin/python -t
#
# Generating the repository metadata
#
# Copyright 2004 Duke University
# Copyright (C) 2005 Red Hat, Inc.
# Author: Miloslav Trmac
# Based on createrepo 0.4.2
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Library General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#

import re, fnmatch, getopt, gzip, md5, os.path, sha, sys, libxml2

# Directory appended to the module search path (unless already listed there)
# before pyrpm is imported.
PYRPMDIR = "/usr/share/pyrpm"
if not PYRPMDIR in sys.path:
    sys.path.append(PYRPMDIR)
import pyrpm

# Version string printed by the -V/--version option.
__version__ = '0.4.2py'

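# Override the gzip header writer so that the header carries a constant zero
# where the stock writer puts a timestamp; otherwise the checksum of the
# compressed output would change from run to run.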

class GzipFile(gzip.GzipFile):
    """gzip.GzipFile variant that writes its own fixed member header."""

    def _write_gzip_header(self):
        """Write the gzip member header to self.fileobj.

        The 32-bit field written through gzip.write32u is always zero, so
        the header bytes depend only on self.filename.
        """
        out = self.fileobj
        out.write('\037\213')             # magic header
        out.write('\010')                 # compression method
        # Name recorded in the header: self.filename without its last three
        # characters.
        stored_name = self.filename[:-3]
        if stored_name:
            flag_byte = gzip.FNAME
        else:
            flag_byte = 0
        out.write(chr(flag_byte))
        gzip.write32u(out, long(0))
        out.write('\002')
        out.write('\377')
        if stored_name:
            out.write(stored_name + '\000')

def _gzipOpen(filename, mode="rb", compresslevel=9):
    """Open filename with the local GzipFile subclass and return the object."""
    gz = GzipFile(filename=filename, mode=mode, compresslevel=compresslevel)
    return gz


def errorprint(stuff):
    """Print stuff, followed by a newline, on standard error."""
    print >> sys.stderr, stuff

def _(args):
    """Stub function for translation: return args unchanged."""
    return args

def usage(retval=1):
    print _("""
    createrepo [options] directory-of-packages

    Options:
     -u, --baseurl = optional base url location for all files
     -x, --exclude = files globs to exclude, can be specified multiple times
     -q, --quiet = run quietly
     -g, --groupfile <filename> to point to for group information (precreated)
     -v, --verbose = run verbosely
     -s, --checksum = md5 or sha - select type of checksum to use (default: sha)
     -h, --help = show this help
     -V, --version = output version
     -p, --pretty = output xml files in pretty format.
     -A, --autoglob = extract file and dir globs automatically from filereqs
     -D, --dirglob = specify a directory glob used for trimming filelist, can
                     be specified multiple times
     -F, --fileglob = specify a file glob used for trimming filelist, can be
                     specified multiple times
         --globinfo = yes or no - enable/disable output of file/dir glob info
                      (default no)
    """)

    sys.exit(retval)


def getFileList(path, ext, filelist):
    """Return all files in path matching ext, store them in filelist, recurse dirs
       return list object"""

    extlen = len(ext)
    try:
        dir_list = os.listdir(path)
    except OSError, e:
        errorprint(_('Error accessing directory %s, %s') % (path, e))
        sys.exit(1)

    dir_list.sort()
    for d in dir_list:
        if os.path.isdir(path + '/' + d):
            filelist = getFileList(path + '/' + d, ext, filelist)
        else:
            if d[-extlen:].lower() == ext:
               newpath = os.path.normpath(path + '/' + d)
               filelist.append(newpath)

    return filelist


def trimRpms(rpms, excludeGlobs):
    """Remove every entry of rpms that matches one of excludeGlobs.

    rpms         -- list of package paths; it is modified in place
    excludeGlobs -- list of fnmatch-style patterns

    Returns the same list object that was passed in as rpms.  All occurrences
    of a matching path are removed, including duplicates, and the relative
    order of the remaining entries is kept.
    """
    kept = []
    for rpm in rpms:
        for glob in excludeGlobs:
            if fnmatch.fnmatch(rpm, glob):
                break
        else:
            # no pattern matched: the package stays
            kept.append(rpm)
    # Slice assignment keeps the caller's list object, as the in-place
    # removal did before.
    rpms[:] = kept
    return rpms

def checkAndMakeDir(dir):
    """
     check out the dir and make it, if possible, return 1 if done, else return 0
    """
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            errorprint(_('%s is not a dir') % dir)
            result = 0
        else:
            if not os.access(dir, os.W_OK):
                errorprint(_('%s is not writable') % dir)
                result = 0
            else:
                result = 1
    else:
        try:
            os.mkdir(dir)
        except OSError, e:
            errorprint(_('Error creating dir %s: %s') % (dir, e))
            result = 0
        else:
            result = 1
    return result


def parseArgs(args):
    """
       Parse the command line args and return a (cmds, directory) tuple.

       cmds is a dict of settings; the 'autoglob' key is only present when
       -A/--autoglob was given.  directory is the single positional argument.
       Any invalid input prints a message on stderr followed by the usage
       text and exits through usage(); -h/--help and -V/--version exit with
       status 0.
    """
    cmds = {}
    cmds['quiet'] = 0
    cmds['verbose'] = 0
    cmds['excludes'] = []
    cmds['baseurl'] = None
    cmds['groupfile'] = None
    cmds['sumtype'] = 'sha'
    cmds['pretty'] = 0
#    cmds['updategroupsonly'] = 0
    cmds['file-pattern-match'] = ['.*bin\/.*', '^\/etc\/.*', '^\/usr\/lib\/sendmail$']
    cmds['dir-pattern-match'] = ['.*bin\/.*', '^\/etc\/.*']
    cmds['globinfo'] = False

    try:
        gopts, argsleft = getopt.getopt(args, 'phqVvg:s:x:u:AD:F:', ['help', 'exclude=',
                                            'quiet', 'verbose',
                                            'baseurl=', 'groupfile=', 'checksum=',
                                            'version', 'pretty', 'autoglob',
                                            'dirglob=', 'fileglob=', 'globinfo='])
    except getopt.error, e:
        errorprint(_('Options Error: %s.') % e)
        usage()

    # Help and version requests are served before any other validation, so
    # they work even without a directory argument.
    for arg, a in gopts:
        if arg in ('-h', '--help'):
            usage(retval=0)
        elif arg in ('-V', '--version'):
            print '%s' % __version__
            sys.exit(0)

    # make sure our dir makes sense before we continue
    if len(argsleft) > 1:
        errorprint(_('Error: Only one directory allowed per run.'))
        usage()
    elif len(argsleft) == 0:
        errorprint(_('Error: Must specify a directory to index.'))
        usage()
    else:
        directory = argsleft[0]

    # The first -D / -F on the command line replaces the built-in default
    # patterns; later ones are added to the user supplied list.
    found_fileglob = False
    found_dirglob = False

    for arg, a in gopts:
        # The long forms are declared in the getopt call above and therefore
        # have to be handled here as well.
        if arg in ('-v', '--verbose'):
            cmds['verbose'] = 1
        elif arg in ('-q', '--quiet'):
            cmds['quiet'] = 1
        elif arg in ('-u', '--baseurl'):
            if cmds['baseurl'] is not None:
                errorprint(_('Error: Only one baseurl allowed.'))
                usage()
            else:
                cmds['baseurl'] = a
        elif arg in ('-g', '--groupfile'):
            if cmds['groupfile'] is not None:
                errorprint(_('Error: Only one groupfile allowed.'))
                usage()
            elif os.path.exists(directory + '/' + a):
                # the group file is looked up relative to the directory
                cmds['groupfile'] = a
            else:
                # translate the template first, then fill in the name
                errorprint(_('Error: groupfile %s cannot be found.') % a)
                usage()
        elif arg in ('-x', '--exclude'):
            cmds['excludes'].append(a)
        elif arg in ('-p', '--pretty'):
            cmds['pretty'] = 1
#        elif arg in ['--update-groups-only']:
#            cmds['updategroupsonly'] = 1
        elif arg in ('-s', '--checksum'):
            if a not in ('md5', 'sha'):
                errorprint(_('Error: checksums are: md5 or sha.'))
                usage()
            else:
                cmds['sumtype'] = a
        elif arg in ('-A', '--autoglob'):
            cmds['autoglob'] = True
        elif arg in ('-D', '--dirglob'):
            if not found_dirglob:
                found_dirglob = True
                cmds['dir-pattern-match'] = []
            if a == '':
                # an empty value clears everything collected so far
                cmds['dir-pattern-match'] = []
            else:
                cmds['dir-pattern-match'].append(a)
        elif arg in ('-F', '--fileglob'):
            if not found_fileglob:
                found_fileglob = True
                cmds['file-pattern-match'] = []
            if a == '':
                # an empty value clears everything collected so far
                cmds['file-pattern-match'] = []
            else:
                cmds['file-pattern-match'].append(a)
        elif arg in ('--globinfo',):
            a = a.lower()
            if a not in ('yes', 'no'):
                errorprint(_('Error: globinfo values are: yes or no'))
                usage()
            else:
                cmds['globinfo'] = a == 'yes'

    return cmds, directory


class MDError(Exception):
    """Error raised while generating metadata.

    args is the optional error message.  It is handed to Exception.__init__
    instead of being assigned to self.args directly: where exceptions are
    new-style classes (Python 2.5 and later) assigning a string to args turns
    it into a tuple of single characters, and assigning None raises
    TypeError.  With this form str(error) is the message itself.
    """

    def __init__(self, args=None):
        if args is None:
            Exception.__init__(self)
        else:
            Exception.__init__(self, args)


def getChecksum(sumtype, file):
    """Return the hex digest of file.

       sumtype = 'md5' or 'sha'
       file    = /path/to/file, or an already opened file-like object

       A path is opened (in binary mode) and closed here; a file-like object
       is read from its current position and left open.

       Raises MDError when sumtype is not supported or when the data cannot
       be read."""

    # Check the checksum type before touching the file, so that a bad type is
    # reported as such and not as a file error.
    if sumtype == 'md5':
        sum_ = md5.new()
    elif sumtype == 'sha':
        sum_ = sha.new()
    else:
        raise MDError('Error Checksumming file, wrong checksum type %s' % sumtype)

    # str and unicode paths are both file names; anything else is assumed to
    # be a file-like object.
    opened_here = isinstance(file, basestring)

    # chunking brazenly lifted from Ryan Tomayko
    try:
        if opened_here:
            fo = open(file, 'rb', pyrpm.DIGEST_CHUNK)
        else:
            fo = file
        try:
            pyrpm.updateDigestFromFile(sum_, fo)
        finally:
            # only close what was opened here, even when reading failed
            if opened_here:
                fo.close()
    except (KeyboardInterrupt, SystemExit):
        # never turn an interrupt or exit request into a checksum error
        raise
    except Exception:
        raise MDError('Error opening file for checksum: %s' % file)

    return sum_.hexdigest()


class OutputFile:
    """One gzip-compressed XML output file, written one subtree at a time.

    A libxml2 document with a single root element (self.root) serves as the
    parent under which callers build each subtree; outputSubtree() serializes
    such a subtree into the file and removes it from the document again.
    """

    def __init__(self, cmds, cmdstag, tag, ns, pkgcount, otherattrs=''):
        """Create the document and the output file and write the XML header.

        cmds       -- settings dict; 'tempdir', cmds[cmdstag] and 'pretty'
                      are read
        cmdstag    -- key in cmds holding the output file name
        tag        -- name of the root element
        ns         -- default namespace URI of the root element
        pkgcount   -- value of the root element's "packages" attribute
        otherattrs -- extra attribute text inserted into the root start tag
        """
        doc = libxml2.newDoc("1.0")
        root = doc.newChild(None, tag, None)
        root.setNs(root.newNs(ns, None))
        self.doc = doc
        self.roottag = tag
        self.root = root

        target = os.path.join(cmds['tempdir'], cmds[cmdstag])
        self.file = _gzipOpen(target, 'w')
        self.file.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        # The root start tag is written by hand; the matching end tag follows
        # in finish().
        start_tag = '<%s xmlns="%s"%s packages="%s">\n' % (tag, ns, otherattrs,
                                                            pkgcount)
        self.file.write(start_tag)
        self.pretty = cmds['pretty']

    def outputSubtree(self, node):
        """Write node to the file, then unlink it from its parent and free it."""
        xml_text = node.serialize('UTF-8', self.pretty)
        for piece in (xml_text, '\n'):
            self.file.write(piece)
        node.unlinkNode()
        node.freeNode()

    def finish(self):
        """Write the root end tag, close the file and free the document."""
        end_tag = '\n</%s>' % self.roottag
        self.file.write(end_tag)
        self.file.close()
        self.doc.freeDoc()


def metadataReadPackage(filename):
    """Read RPM package filename, without verification or reading the payload.

    For convenience only, other metadata* functions work with any RpmPackage
    as long it contains all needed tags.  Raise ValueError on invalid data,
    IOError.

    This is a thin wrapper around pyrpm.metadataReadPackage, the reader the
    rest of this file uses.  The previous body went through a module named
    "functions" that this file never imports, so every call ended in a
    NameError.
    NOTE(review): assumes pyrpm.metadataReadPackage reads the header only,
    without verification, as described above -- confirm against pyrpm."""
    return pyrpm.metadataReadPackage(filename)


def doPkgMetadata(cmds):
    """all the heavy lifting for the package metadata"""

    # rpms we're going to be dealing with
    files = []
    files = getFileList('./', '.rpm', files)
    files = trimRpms(files, cmds['excludes'])
    pkgcount = len(files)

    # setup the base metadata doc
    primary = OutputFile(cmds, 'primaryfile', 'metadata',
                         'http://linux.duke.edu/metadata/common', pkgcount,
                         ' xmlns:rpm="http://linux.duke.edu/metadata/rpm"')
    formatns = primary.root.newNs('http://linux.duke.edu/metadata/rpm', 'rpm')

    # setup the file list doc
    filelists = OutputFile(cmds, 'filelistsfile', 'filelists',
                           'http://linux.duke.edu/metadata/filelists',
                           pkgcount)

    # setup the other doc
    other = OutputFile(cmds, 'otherfile', 'otherdata',
                       'http://linux.duke.edu/metadata/other', pkgcount)

    # Find all real file requires
    if cmds.has_key('autoglob'):
        cmds['file-pattern-match'] = []
        cmds['dir-pattern-match'] = []
        current = 0
        fhash = {}
        if not cmds['quiet']:
            print "Pass 1: Finding file requires"
        for filename in files:
            current+=1
            try:
                # FIXME: don't open the file twice?
                pkgid = getChecksum(cmds['sumtype'], filename)
                hdr = pyrpm.metadataReadPackage(filename)
                if not cmds['quiet']:
                    if cmds['verbose']:
                        print '%d/%d - %s' % (current, len(files), filename)
                    else:
                        sys.stdout.write('\r' + ' ' * 80)
                        sys.stdout.write("\r%d/%d - %s" % (current, len(files), filename))
                        sys.stdout.flush()
            except (IOError, MDError, ValueError), e:
                errorprint('\n%s - %s' % (e, filename))
                continue
            for regex in hdr["requirename"]:
                if regex.startswith("/"):
                    regex = regex.replace("\\", "\\\\")
                    regex = regex.replace("/", "\\/")
                    regex = regex.replace(".", "\\.")
                    regex = regex.replace("*", "\\*")
                    regex = regex.replace("+", "\\+")
                    regex = regex.replace("^", "\\^")
                    regex = regex.replace("$", "\\$")
                    regex = regex.replace("?", "\\?")
                    regex = regex.replace("{", "\\{")
                    regex = regex.replace("}", "\\}")
                    regex = regex.replace("[", "\\[")
                    regex = regex.replace("]", "\\]")
                    regex = regex.replace("|", "\\|")
                    regex = regex.replace("(", "\\(")
                    regex = regex.replace(")", "\\)")
                    regex = '^' + regex + '$'
                    fhash[regex] = 1
        cmds['dir-pattern-match'].extend(fhash.keys())
        cmds['file-pattern-match'].extend(fhash.keys())

        if not cmds['quiet']:
            print "\nPass 2: Generating repodata files"

    current = 0
    for filename in files:
        current+=1
        try:
            # FIXME: don't open the file twice?
            pkgid = getChecksum(cmds['sumtype'], filename)
            hdr = pyrpm.metadataReadPackage(filename)
        except (IOError, MDError, ValueError), e:
            errorprint('\n%s - %s' % (e, filename))
            continue
        if not cmds['quiet']:
            if cmds['verbose']:
                print '%d/%d - %s' % (current, len(files), filename)
            else:
                sys.stdout.write('\r' + ' ' * 80)
                sys.stdout.write("\r%d/%d - %s" % (current, len(files), filename))
                sys.stdout.flush()
        node = pyrpm.metadataPrimaryNode(primary.root, formatns, hdr, pkgid,
                                         cmds['sumtype'], filename,
                                         cmds['baseurl'])
        primary.outputSubtree(node)

        node = pyrpm.metadataFilelistsNode(filelists.root, hdr, pkgid)
        filelists.outputSubtree(node)

        node = pyrpm.metadataOtherNode(other.root, hdr, pkgid)
        other.outputSubtree(node)

    if not cmds['quiet']:
        print ''

    # save them up to the tmp locations:
    if not cmds['quiet']:
        print _('Saving Primary metadata')
    primary.finish()

    if not cmds['quiet']:
        print _('Saving file lists metadata')
    filelists.finish()

    if not cmds['quiet']:
        print _('Saving other metadata')
    other.finish()


def _newDataNode(node, cmds, ftype, filename, csum, timestamp):
    """Append a <data> element describing one metadata file to node.

    The element gets its type attribute plus location, checksum and timestamp
    children (in that order) and is returned so that the caller can add
    further children."""
    data = node.newChild(None, 'data', None)
    data.newProp('type', ftype)
    location = data.newChild(None, 'location', None)
    if cmds['baseurl'] is not None:
        location.newProp('xml:base', cmds['baseurl'])
    # the href points at the final directory, not the temporary one
    location.newProp('href', os.path.join(cmds['finaldir'], filename))
    checksum = data.newChild(None, 'checksum', csum)
    checksum.newProp('type', cmds['sumtype'])
    data.newChild(None, 'timestamp', str(timestamp))
    return data


def repoXML(node, cmds):
    """generate the repomd.xml file that stores the info on the other files

    node is the element the <data> entries are appended to.  The metadata
    files are read from cmds['tempdir'].  If cmds['groupfile'] is set, that
    file is copied into cmds['tempdir'] and gets an entry of its own.
    Errors from getChecksum and from file access propagate to the caller."""
    sumtype = cmds['sumtype']
    workfiles = [(cmds['otherfile'], 'other',),
                 (cmds['filelistsfile'], 'filelists'),
                 (cmds['primaryfile'], 'primary')]

    for (filename, ftype) in workfiles:
        path = os.path.join(cmds['tempdir'], filename)

        # checksum of the uncompressed data ...
        zfo = _gzipOpen(path)
        try:
            uncsum = getChecksum(sumtype, zfo)
        finally:
            zfo.close()
        # ... and of the compressed file as stored on disk
        csum = getChecksum(sumtype, path)
        timestamp = os.stat(path).st_mtime

        data = _newDataNode(node, cmds, ftype, filename, csum, timestamp)
        unchecksum = data.newChild(None, 'open-checksum', uncsum)
        unchecksum.newProp('type', sumtype)
        if ftype == 'primary' and cmds['globinfo']:
            for glob in cmds['file-pattern-match']:
                data.newChild(None, 'fileglob', glob)
            for glob in cmds['dir-pattern-match']:
                data.newChild(None, 'dirglob', glob)

    # if we've got a group file then checksum it once and be done
    if cmds['groupfile'] is not None:
        grpfile = cmds['groupfile']
        timestamp = os.stat(grpfile).st_mtime
        sfile = os.path.basename(grpfile)
        # Binary mode: the copy and the checksum must cover the bytes on
        # disk, whatever the platform.
        fo = open(grpfile, 'rb')
        try:
            output = open(os.path.join(cmds['tempdir'], sfile), 'wb')
            try:
                output.write(fo.read())
            finally:
                output.close()
            # rewind so that the checksum covers the whole file
            fo.seek(0)
            csum = getChecksum(sumtype, fo)
        finally:
            fo.close()

        _newDataNode(node, cmds, 'group', sfile, csum, timestamp)


def doRepoMetadata(cmds):
    """wrapper to generate the repomd.xml file that stores the info on the other files

    Builds the <repomd> document, lets repoXML() fill it in and saves it as
    cmds['repomdfile'] inside cmds['tempdir'].  If the file cannot be saved
    an error is printed and the program exits with status 1.  Exceptions
    raised by repoXML() propagate to the caller."""
    repodoc = libxml2.newDoc("1.0")
    reporoot = repodoc.newChild(None, "repomd", None)
    repons = reporoot.newNs('http://linux.duke.edu/metadata/repo', None)
    reporoot.setNs(repons)
    repofilepath = os.path.join(cmds['tempdir'], cmds['repomdfile'])

    try:
        repoXML(reporoot, cmds)

        try:
            ret = repodoc.saveFormatFileEnc(repofilepath, 'UTF-8', 1)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            ret = -1
    finally:
        # libxml2 documents have to be released explicitly; dropping the
        # Python reference is not enough.
        repodoc.freeDoc()

    # libxml2 reports a failed save through a negative return value.  None is
    # tolerated in case a binding returns nothing.
    if ret is not None and ret < 0:
        errorprint(_('Error saving temp file for rep xml: %s') % repofilepath)
        sys.exit(1)



def main():
    args = sys.argv[1:]
    cmds, directory = parseArgs(args)

    #setup some defaults
    cmds['primaryfile'] = 'primary.xml.gz'
    cmds['filelistsfile'] = 'filelists.xml.gz'
    cmds['otherfile'] = 'other.xml.gz'
    cmds['repomdfile'] = 'repomd.xml'
    cmds['tempdir'] = '.repodata'
    cmds['finaldir'] = 'repodata'
    cmds['olddir'] = '.olddata'

    # save where we are right now
    curdir = os.getcwd()
    # start the sanity/stupidity checks
    if not os.path.exists(directory):
        errorprint(_('Directory must exist'))
        return 1

    if not os.path.isdir(directory):
        errorprint(_('Directory of packages must be a directory.'))
        return 1

    if not os.access(directory, os.W_OK):
        errorprint(_('Directory must be writable.'))
        return 1


    if not checkAndMakeDir(os.path.join(directory, cmds['tempdir'])):
        return 1

    if not checkAndMakeDir(os.path.join(directory, cmds['finaldir'])):
        return 1

    if os.path.exists(os.path.join(directory, cmds['olddir'])):
        errorprint(_('Old data directory exists, please remove: %s') % cmds['olddir'])
        return 1

    # change to the basedir to work from w/i the path - for relative url paths
    os.chdir(directory)

    # make sure we can write to where we want to write to:
    for direc in ['tempdir', 'finaldir']:
        for key in ['primaryfile', 'filelistsfile', 'otherfile', 'repomdfile']:
            filepath = os.path.join(cmds[direc], cmds[key])
            if os.path.exists(filepath):
                if not os.access(filepath, os.W_OK):
                    errorprint(_('error in must be able to write to metadata files:\n  -> %s') % filepath)
                    os.chdir(curdir)
                    usage()

    try:
        doPkgMetadata(cmds)
    except:
        # always clean up your messes
        os.chdir(curdir)
        raise

    try:
        doRepoMetadata(cmds)
    except:
        os.chdir(curdir)
        raise

    if os.path.exists(cmds['finaldir']):
        try:
            os.rename(cmds['finaldir'], cmds['olddir'])
        except:
            errorprint(_('Error moving final to old dir'))
            os.chdir(curdir)
            return 1

    try:
        os.rename(cmds['tempdir'], cmds['finaldir'])
    except:
        errorprint(_('Error moving final metadata into place'))
        # put the old stuff back
        os.rename(cmds['olddir'], cmds['finaldir'])
        os.chdir(curdir)
        return 1

    for key in ['primaryfile', 'filelistsfile', 'otherfile', 'repomdfile', 'groupfile']:
        if cmds[key]:
            fn = os.path.basename(cmds[key])
        else:
            continue
        oldfile = os.path.join(cmds['olddir'], fn)
        if os.path.exists(oldfile):
            try:
                os.remove(oldfile)
            except OSError, e:
                errorprint(_('Could not remove old metadata file: %s') % oldfile)
                errorprint(_('Error was %s') % e)
                os.chdir(curdir)
                return 1

    try:
        os.rmdir(cmds['olddir'])
    except OSError, e:
        errorprint(_('Could not remove old metadata dir: %s') % cmds['olddir'])
        errorprint(_('Error was %s') % e)
        errorprint(_('Please clean up this directory manually.'))
        os.chdir(curdir)

    # take us home mr. data
    os.chdir(curdir)


# Script entry point: hand main() to pyrpm.run_main.
# NOTE(review): run_main presumably calls main() and turns its return value
# into the process exit status -- confirm against pyrpm.
if __name__ == "__main__":
    pyrpm.run_main(main)

# vim:ts=4:sw=4:showmatch:expandtab
