#!/usr/bin/env python2.7
# -*- Mode: Python; coding: utf-8 -*-

# pkgsrc indexer
#
# Usage: ./indexpkg.py -P /path/to/pkgsrc -j12 -m bmake /path/to/pkg.index
#
# Generates a SQL-queryable index of pkgsrc packages.  If the index
# already exists, picks up where it left off if interrupted, and/or
# incrementally updates it based on which makefiles, if any, have
# changed.

# Copyright (c) 2017--2019 Taylor R. Campbell
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.

try:                            # thanks, python3
    import Queue
except ImportError:
    import queue as Queue

import apsw
import contextlib
import ctypes
import errno
import io
import multiprocessing
import os
import subprocess
import sys
import traceback

APPLICATION_ID = 0x504b4749     # PKGI

SCHEMA_1 = '''
PRAGMA user_version = 1;

CREATE TABLE top (
    ok          BOOLEAN NOT NULL
);
CREATE TABLE cat_new (
    name        TEXT NOT NULL PRIMARY KEY
) WITHOUT ROWID;
CREATE TABLE cat (
    name        TEXT NOT NULL PRIMARY KEY
) WITHOUT ROWID;
CREATE TABLE pkgpath_new (
    cat         TEXT NOT NULL REFERENCES cat(name) ON DELETE CASCADE,
    name        TEXT NOT NULL,
    PRIMARY KEY(cat, name)
) WITHOUT ROWID;
CREATE TABLE pkgpath (
    id          INTEGER NOT NULL PRIMARY KEY,
    cat         TEXT NOT NULL REFERENCES cat(name),
    name        TEXT NOT NULL,
    homepage    TEXT,
    comment     TEXT,
    license     TEXT,
    onlyfor     TEXT,
    notfor      TEXT,
    owner       TEXT,
    maintainer  TEXT,
    descr_src_id INTEGER REFERENCES src(id),
    descr_mtime REAL,
    descr       TEXT,
    prefix      TEXT,
    UNIQUE(cat, name)
);
CREATE INDEX pkgpath_i_name ON pkgpath (name);
CREATE TABLE pkgbuild (
    id          INTEGER NOT NULL PRIMARY KEY,
    pkgbase     TEXT NOT NULL,
    version     TEXT NOT NULL,
    pkgpath_id  INTEGER NOT NULL REFERENCES pkgpath(id) ON DELETE CASCADE,
    pkg_skip_reason TEXT,
    pkg_fail_reason TEXT,
    no_bin_on_ftp TEXT,
    restricted  TEXT,
    maintainer  TEXT,
    use_destdir TEXT,
    bootstrap_pkg BOOLEAN,
    usergroup_phase TEXT,
    multi_version TEXT
);
CREATE INDEX pkgbuild_i_pkgbase ON pkgbuild (pkgbase);
CREATE TABLE src (
    id          INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
    path        TEXT NOT NULL UNIQUE
);
CREATE TABLE top_src (
    src_id      INTEGER NOT NULL PRIMARY KEY REFERENCES src(id),
    mtime       REAL
) WITHOUT ROWID;
CREATE TABLE cat_src (
    cat         TEXT NOT NULL REFERENCES cat(name) ON DELETE CASCADE,
    src_id      INTEGER NOT NULL REFERENCES src(id),
    mtime       REAL,
    PRIMARY KEY(cat, src_id)
) WITHOUT ROWID;
CREATE TABLE pkg_src (
    pkgpath_id  INTEGER NOT NULL REFERENCES pkgpath(id) ON DELETE CASCADE,
    src_id      INTEGER NOT NULL REFERENCES src(id),
    mtime       REAL,
    PRIMARY KEY(pkgpath_id, src_id)
) WITHOUT ROWID;
CREATE TABLE deptype (
    name        TEXT NOT NULL PRIMARY KEY
) WITHOUT ROWID;
INSERT INTO deptype (name) VALUES ('boot');
INSERT INTO deptype (name) VALUES ('tool');
INSERT INTO deptype (name) VALUES ('build');
INSERT INTO deptype (name) VALUES ('run');
CREATE TABLE pkgpath_depends (
    pkgpath_id  INTEGER NOT NULL REFERENCES pkgpath(id) ON DELETE CASCADE,
    type        TEXT NOT NULL REFERENCES deptype(name),
    depcat      TEXT NOT NULL,
    depname     TEXT NOT NULL,
    depversion  TEXT NOT NULL,
    PRIMARY KEY(pkgpath_id, type, depcat, depname, depversion)
    -- No foreign key constraint so (a) we can build the database
    -- incrementally, and (b) we can gracefully handle errors in
    -- pkgsrc.  But this is what is intended.
    --FOREIGN KEY(depcat, depname) REFERENCES pkgpath(cat, name)
) WITHOUT ROWID;
CREATE TABLE pkgbuild_depends (
    pkgbuild_id INTEGER NOT NULL REFERENCES pkgbuild(id) ON DELETE CASCADE,
    -- pbulk doesn't distinguish dependency types.
    --type      TEXT NOT NULL REFERENCES deptype(name),
    depcat      TEXT NOT NULL,
    depname     TEXT NOT NULL,
    depversion  TEXT NOT NULL,
    PRIMARY KEY(pkgbuild_id, depcat, depname, depversion)
    -- No foreign key constraint so (a) we can build the database
    -- incrementally, and (b) we can gracefully handle errors in
    -- pkgsrc.  But this is what is intended.
    --FOREIGN KEY(depcat, depname) REFERENCES pkgpath(cat, name)
) WITHOUT ROWID;
CREATE TABLE pkgpath_conflicts (
    pkgpath_id  INTEGER NOT NULL REFERENCES pkgpath(id) ON DELETE CASCADE,
    -- XXX Split into pkgbase and version.  Complicated because of
    -- multipackages like py27-xxx vs py37-xxx.
    pattern     TEXT NOT NULL,
    PRIMARY KEY(pkgpath_id, pattern)
) WITHOUT ROWID;
CREATE TABLE pkgbuild_cat (
    pkgbuild_id INTEGER NOT NULL REFERENCES pkgbuild(id) ON DELETE CASCADE,
    cat         TEXT NOT NULL, -- REFERENCES cat(name)
    PRIMARY KEY(pkgbuild_id, cat)
) WITHOUT ROWID;
'''
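
# The schema above is the query interface.  For example, once the index
# has been built, the reverse dependencies of devel/glib2 could be
# listed with the stock sqlite3 shell (an illustration, not part of
# this program):
#
#       $ sqlite3 pkg.index "SELECT p.cat || '/' || p.name
#               FROM pkgpath p, pkgpath_depends d
#               WHERE d.pkgpath_id = p.id
#                 AND d.depcat = 'devel' AND d.depname = 'glib2'"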

@contextlib.contextmanager
def open_fd(path, flags):
    fd = os.open(path, flags)
    try:
        yield fd
    finally:
        os.close(fd)

@contextlib.contextmanager
def transaction(db, type=None):
    cursor = db.cursor()
    cursor.execute('BEGIN %s' % ('' if type is None else type,))
    ok = False
    try:
        yield
        ok = True
    finally:
        cursor.execute('COMMIT' if ok else 'ROLLBACK')

def fetchvalue(cursor, nullok=None):
    if nullok is None:
        nullok = False
    try:
        row = next(cursor)
    except StopIteration:
        if nullok:
            return None
        raise ValueError('Empty cursor should be nonempty')
    if len(row) != 1:
        raise ValueError('Multi-column cursor should be single-column')
    try:
        row1 = next(cursor)
    except StopIteration:
        pass
    else:
        raise ValueError(
            'Multi-row cursor should be single-row: %r, %r' % (row, row1))
    return row[0]

def process_join(p):
    p.join()
    #while p.is_alive():
    #    p.join(timeout=1000)

def queue_get(q):
    while True:
        try:
            return q.get(timeout=1000)
        except Queue.Empty:
            pass

def queue_put(q, x):
    q.put(x, timeout=1000)

def queue_done(q):
    #q.task_done()
    pass

def task_scheduler(handler, njobs, dry_run):
    if dry_run:
        return dry_task_scheduler(handler)
    elif njobs == 1:
        return serial_task_scheduler(handler)
    else:
        return parallel_task_scheduler(handler, njobs)

@contextlib.contextmanager
def dry_task_scheduler(handler):
    def dry_handler(task):
        pass
    yield dry_handler

@contextlib.contextmanager
def serial_task_scheduler(handler):
    yield handler
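
# A sketch of the scheduler contract shared by the three variants
# (hypothetical caller):
#
#       with task_scheduler(handler, njobs, dry_run) as schedule:
#           for task in tasks:
#               schedule(task)
#
# Each scheduled task is handed to handler(task): inline when njobs is
# 1, in worker subprocesses when njobs > 1 (barring a worker error,
# which aborts the run), or not at all in a dry run.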

@contextlib.contextmanager
def parallel_task_scheduler(handler, njobs):
    q = multiprocessing.Queue(1)
    exit_flag = multiprocessing.Value(ctypes.c_bool)
    error_flag = multiprocessing.Value(ctypes.c_bool)
    workers = []

    def work():
        try:
            while not exit_flag.value:
                task = queue_get(q)
                if exit_flag.value or task is None:
                    queue_done(q)
                    break
                handler(task)
        except Exception as e:
            sys.stderr.write('%r: %s\n'
                % (os.getpid(), traceback.format_exc()))
            exit_flag.value = True
            error_flag.value = True
            queue_done(q)

    def spawn_worker():
        worker = multiprocessing.Process(target=work)
        worker.start()
        workers.append(worker)

    def schedule(task):
        if exit_flag.value:
            return
        try:
            q.put_nowait(task)
        except Queue.Full:
            if exit_flag.value:
                return
            if len(workers) < njobs:
                spawn_worker()
            while not exit_flag.value:
                try:
                    queue_put(q, task)
                except Queue.Full:
                    continue
                else:
                    break

    spawn_worker()
    try:
        yield schedule
    finally:
        if not exit_flag.value:
            for _ in workers:
                queue_put(q, None)
                if exit_flag.value:
                    break
        for i, worker in enumerate(workers):
            if not exit_flag.value:
                process_join(worker)
        if error_flag.value:
            raise Exception('error')

# -d options
DBG_SQL = 1
DBG_OODATE = 2
DBG_SRCALIAS = 4
DBG_SUMMARY = 8

DBGFLAGS = {
    'A': -1,
    'a': DBG_SRCALIAS,
    'd': DBG_OODATE,
    'q': DBG_SQL,
    's': DBG_SUMMARY,
}

class Indexer(object):
    def __init__(
            self, dbpath, pkgsrcdir, dbgout, dbgflags, msgout, errout,
            prog_bmake, njobs, dry_run):
        self._dbpath = dbpath
        self._pkgsrcdir = pkgsrcdir
        self._dbgout = dbgout
        self._dbgflags = dbgflags
        self._msgout = msgout
        self._errout = errout
        self._prog_bmake = prog_bmake
        self._njobs = njobs
        self._dry_run = dry_run
        self._lock = multiprocessing.Lock()
        self.__db = None
        self._pid = 0

    @property
    def _db(self):
        pid = os.getpid()
        if self._pid != pid or self.__db is None:
            flags = 0
            if self._dry_run:
                flags |= apsw.SQLITE_OPEN_READONLY
                try:
                    self.__db = apsw.Connection(self._dbpath, flags=flags)
                except apsw.CantOpenError:
                    self._msg('no database')
                    self.__db = False
                    self._pid = pid
                    return False
            else:
                flags |= apsw.SQLITE_OPEN_READWRITE
                flags |= apsw.SQLITE_OPEN_CREATE
                self.__db = apsw.Connection(self._dbpath, flags=flags)
            self._pid = pid
            self._db_initialize()
        return self.__db

    def _db_initialize(self):
        # Wait up to 10 sec to lock the database.
        self._db.setbusytimeout(10000)
        # Create the os_mtime function for detecting changes.
        self._db.createscalarfunction('os_mtime', self._os_mtime, 1)
        # Enable foreign key constraints.
        self._sql('PRAGMA foreign_keys = ON')
        # Limit the unused size of the journal.  Note that this does
        # not limit the total size for active transactions.
        if not self._dry_run:
            self._sql('PRAGMA journal_size_limit = 1000000')

    def db_check(self):
        self._msg('check database integrity')
        if self._db:
            self._sql('PRAGMA integrity_check')
            self._sql('PRAGMA foreign_key_check')

    def _os_mtime(self, path):
        try:
            return os.path.getmtime(path)
        except OSError as e:
            if e.errno == errno.ENOENT:
                return None
            raise

    def db_schema(self):
        if not self._db:
            assert self._dry_run
            return
        application_id = self._sql1('PRAGMA application_id', nullok=True)
        user_version = self._sql1('PRAGMA user_version', nullok=True)
        if application_id is None:
            raise EnvironmentError('sqlite3 too old for application_id')
        if application_id == 0 and user_version == 0:
            self._msg('create database')
            if not self._dry_run:
                self._sql('PRAGMA journal_mode = WAL')
                with transaction(self._db):
                    self._sql('PRAGMA application_id = %d'
                        % (APPLICATION_ID,))
                    self._sql(SCHEMA_1)
            user_version = 1
        elif application_id != APPLICATION_ID:
            raise IOError('Invalid application id: %d' % (application_id,))
        if user_version != 1:
            raise IOError('Invalid pkgindex version: %d' % (user_version,))
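
    # index() proceeds in incremental passes: the top-level category
    # list, then each new or out-of-date category, then each new or
    # out-of-date package, then any missing DESCR text.  A record is
    # out of date when the mtime recorded for one of its makefile
    # sources no longer matches os_mtime on disk.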

    def index(self):
        if not self._db:
            assert self._dry_run
            return

        # Remove unreferenced sources so that we don't waste time
        # checking them.
        self._gc(False)

        # Stat all the source files.
        self._sql('''
            CREATE TEMP TABLE src_mtime (id INTEGER PRIMARY KEY,
                mtime REAL NOT NULL)
        ''')
        self._sql('''
            INSERT INTO src_mtime (id, mtime)
            SELECT id, os_mtime(path) FROM src
        ''')

        reindexed_top = 0
        reindexed_cat = 0
        reindexed_pkg = 0
        reindexed_descr = 0

        # Index the list of categories if it hasn't been indexed or if
        # it may have changed.
        if self._sql1('SELECT COUNT(ok) FROM top WHERE ok'):
            index_categories = self._sql1('''
                SELECT COUNT(*) FROM top_src AS tm, src_mtime AS mm
                WHERE tm.src_id = mm.id AND tm.mtime != mm.mtime
            ''')
        else:
            index_categories = True
        if index_categories:
            self._index_categories()
            reindexed_top += 1

        # Index each category if it hasn't been indexed or if it may
        # have changed.
        with task_scheduler(self._index_category, self._njobs,
                self._dry_run) as schedule:
            if self._dbgflags & DBG_OODATE:
                for (cat,) in self._sql('SELECT name FROM cat_new'):
                    self._dbg(DBG_OODATE, 'category %s new', cat)
                for (cat, path, otime, ntime) in self._sql('''
                    SELECT cat.name, src.path,
                        strftime('%Y-%m-%dT%H:%M:%fZ', cat_src.mtime,
                            'unixepoch'),
                        strftime('%Y-%m-%dT%H:%M:%fZ', src_mtime.mtime,
                            'unixepoch')
                    FROM cat, cat_src, src_mtime, src
                    WHERE cat.name = cat_src.cat
                    AND cat_src.src_id = src_mtime.id
                    AND cat_src.mtime != src_mtime.mtime
                    AND src_mtime.id = src.id
                '''):
                    self._dbg(DBG_OODATE,
                        'category %s out-of-date %s %s -> %s',
                        cat, path, otime, ntime)
            cursor = self._sql('''
                SELECT name FROM cat_new
                UNION
                SELECT cat.name FROM cat, cat_src, src_mtime
                WHERE cat.name = cat_src.cat
                AND cat_src.src_id = src_mtime.id
                AND cat_src.mtime != src_mtime.mtime
            ''')
            for (cat,) in cursor.fetchall():
                schedule(cat)
                reindexed_cat += 1

        # Index each package if it hasn't been indexed or if it may
        # have changed.
        with task_scheduler(self._index_pkg, self._njobs,
                self._dry_run) as schedule:
            if self._dbgflags & DBG_OODATE:
                for (cat, name) in \
                        self._sql('SELECT cat, name FROM pkgpath_new'):
                    self._dbg(DBG_OODATE, 'package %s/%s: new', cat, name)
                for (cat, name, path, otime, ntime) in self._sql('''
                    SELECT pkgpath.cat, pkgpath.name, src.path,
                        strftime('%Y-%m-%dT%H:%M:%fZ', pkg_src.mtime,
                            'unixepoch'),
                        strftime('%Y-%m-%dT%H:%M:%fZ', src_mtime.mtime,
                            'unixepoch')
                    FROM pkgpath, pkg_src, src_mtime, src
                    WHERE pkgpath.id = pkg_src.pkgpath_id
                    AND pkg_src.src_id = src_mtime.id
                    AND pkg_src.mtime != src_mtime.mtime
                    AND src_mtime.id = src.id
                '''):
                    self._dbg(DBG_OODATE,
                        'package %s/%s out-of-date %s %s -> %s',
                        cat, name, path, otime, ntime)
            cursor = self._sql('''
                SELECT cat, name FROM pkgpath_new
                UNION
                SELECT pkgpath.cat, pkgpath.name
                FROM pkgpath, pkg_src, src_mtime
                WHERE pkgpath.id = pkg_src.pkgpath_id
                AND pkg_src.src_id = src_mtime.id
                AND pkg_src.mtime != src_mtime.mtime
            ''')
            # Fetch all outputs before scheduling any worker processes
            # in order to avoid a long-running reader process while we
            # write to the database, which would prevent truncating the
            # write-ahead log, or block the writers altogether if we
            # used a rollback log instead.
            for catpkg in cursor.fetchall():
                schedule(catpkg)
                reindexed_pkg += 1

        # Gather any missing DESCR files.
        with task_scheduler(self._get_descr, self._njobs,
                self._dry_run) as schedule:
            for catpkg in self._sql('''
                SELECT pkgpath.cat, pkgpath.name FROM pkgpath
                WHERE pkgpath.descr_mtime IS NULL
                OR EXISTS
                    (SELECT * FROM src_mtime
                     WHERE (pkgpath.descr_src_id = src_mtime.id
                        AND pkgpath.descr_mtime != src_mtime.mtime))
            ''').fetchall():
                schedule(catpkg)
                reindexed_descr += 1

        self._msg('reindexed %d categories, %d packages, %d DESCR files',
            reindexed_cat, reindexed_pkg, reindexed_descr)

        vacuum_p = False
        vacuum_p |= reindexed_top
        vacuum_p |= reindexed_cat
        vacuum_p |= reindexed_pkg
        vacuum_p |= reindexed_descr

        # Now that things may have been reindexed, remove unreferenced
        # sources again.
        self._gc(vacuum_p)
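
    # The two reports below are read-only consistency checks: dangling
    # dependency edges (tolerated on purpose -- see the commented-out
    # FOREIGN KEY constraints in SCHEMA_1) and package names that recur
    # in more than one category (legal, since only (cat, name) is
    # unique).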

    def orphan_dependencies(self):
        if not self._db:
            return []
        # XXX Do pkgbuild_depends.
        cursor = self._sql('''
            SELECT pkgpath.cat||'/'||pkgpath.name,
                pkgpath_depends.type,
                pkgpath_depends.depcat||'/'||pkgpath_depends.depname
            FROM pkgpath, pkgpath_depends
            WHERE pkgpath.id = pkgpath_depends.pkgpath_id
            AND pkgpath_depends.depcat||'/'||pkgpath_depends.depname
                NOT IN (SELECT cat||'/'||name FROM pkgpath)
        ''')
        return cursor.fetchall()

    def duplicate_pkgpaths(self):
        if not self._db:
            return []
        classes = []
        for (name,) in self._sql('''
            SELECT DISTINCT p0.name
            FROM pkgpath AS p0 JOIN pkgpath AS p1 USING (name)
            WHERE p0.id < p1.id -- ~Half work vs !=.
            ORDER BY p0.name ASC
        '''):
            cursor = self._sql('''
                SELECT cat FROM pkgpath WHERE name = ?
            ''', (name,))
            classes.append((name, [cat for (cat,) in cursor]))
        return classes

    def _gc(self, vacuum_p):
        self._msg('remove unreferenced sources')
        if not self._dry_run:
            # Mark & sweep.
            with self._transaction():
                self._sql(
                    'CREATE TEMP TABLE src_marked (id INTEGER PRIMARY KEY)')
                for tab, col in [
                        ('top_src', 'src_id'),
                        ('cat_src', 'src_id'),
                        ('pkg_src', 'src_id'),
                        ('pkgpath', 'descr_src_id'),
                ]:
                    self._sql('''
                        INSERT OR IGNORE INTO src_marked (id)
                        SELECT %s FROM %s
                    ''' % (col, tab))
                before = self._db.totalchanges()
                self._sql('DELETE FROM src WHERE id NOT IN src_marked')
                after = self._db.totalchanges()
                self._msg('removed %d unreferenced sources', after - before)
                self._sql('DROP TABLE src_marked')
        if vacuum_p:
            self._msg('vacuum')
            if not self._dry_run:
                self._sql('VACUUM')
            self._msg('analyze')
            if not self._dry_run:
                self._sql('ANALYZE')
            self._msg('checkpoint/truncate journal')
            if not self._dry_run:
                self._sql('PRAGMA wal_checkpoint(TRUNCATE)')
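
    # Each _index_* step below runs as a single transaction and clears
    # its entry in the corresponding schedule table (top.ok, cat_new,
    # pkgpath_new) only as its last act, so an interrupted run simply
    # redoes the unfinished steps when restarted.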

    def _index_categories(self):
        self._msg('index categories')

        # Get the list of categories and the list of makefiles that
        # contributed to it.
        cats, top_srcs = self._list_subdirs()

        # Index the list of categories in a single transaction.
        with self._transaction():
            # Empty the current category schedule -- we will refill it.
            self._sql('DELETE FROM cat_new')

            # Replace the top-level makefile dependencies.
            self._sql('DELETE FROM top_src')
            for src in top_srcs:
                src_id = self._src_id(src)
                # XXX Race: We can't get the mtime until we know the
                # source path.  But by the time we get the mtime, the
                # makefile source may have changed since we invoked
                # make.  Only way around is to teach make to fstat all
                # its own makefiles for ${.MAKE.MAKEFILES}...
                self._sql('''
                    INSERT OR IGNORE INTO top_src (src_id, mtime)
                    VALUES (:src_id, os_mtime(:src))
                ''', {'src_id': src_id, 'src': src})

            # Create a temporary table for the new list of categories.
            self._sql('''
                CREATE TEMP TABLE cat_tmp (name TEXT PRIMARY KEY NOT NULL)
                WITHOUT ROWID
            ''')
            for cat in cats:
                self._sql('INSERT INTO cat_tmp (name) VALUES (?)', (cat,))

            # Delete any packages in categories that no longer exist,
            # and then delete the categories now that they are
            # unreferenced.  (If a package moved from one category to
            # another, tough -- we'll just have to rescan it.)
            self._sql('DELETE FROM pkgpath WHERE cat NOT IN cat_tmp')
            self._sql('DELETE FROM cat WHERE name NOT IN cat_tmp')

            # Schedule any new categories.
            self._sql('''
                INSERT INTO cat_new (name)
                SELECT name FROM cat_tmp
                WHERE name NOT IN (SELECT name FROM cat)
            ''')

            # Delete the temporary table.
            self._sql('DROP TABLE cat_tmp')

            # Done.  Note that we have fully indexed a snapshot of the
            # list of categories.
            self._sql('INSERT OR IGNORE INTO top (ok) VALUES (1)')

    def _index_category(self, cat):
        self._msg('index category %s', cat)

        # Get the list of packages and the list of makefiles that
        # contributed to it.
        pkgnames, cat_srcs = self._list_subdirs(cat)

        # Index the category in a single transaction.
        with self._transaction():
            # Empty the current package schedule for this category --
            # we will refill it.
            self._sql('DELETE FROM pkgpath_new WHERE cat = ?', (cat,))

            # Insert this category if it's not already there.
            self._sql('INSERT OR IGNORE INTO cat (name) VALUES (?)', (cat,))

            # Replace this category's sources.  Some sources may be
            # referred to by different pathnames, so INSERT OR IGNORE
            # in case there are duplicates.
            self._sql('DELETE FROM cat_src WHERE cat = ?', (cat,))
            for src in cat_srcs:
                src_id = self._src_id(src)
                self._sql('''
                    INSERT OR IGNORE INTO cat_src (cat, src_id, mtime)
                    VALUES (:cat, :src_id, os_mtime(:src))
                ''', {'cat': cat, 'src_id': src_id, 'src': src})

            # Create a temporary table for the new list of packages.
            self._sql('''
                CREATE TEMP TABLE pkgname_tmp (name TEXT NOT NULL
                    PRIMARY KEY)
                WITHOUT ROWID
            ''')
            for pkgname in pkgnames:
                self._sql('INSERT INTO pkgname_tmp (name) VALUES (?)',
                    (pkgname,))

            # Delete the packages that no longer exist.
            self._sql('''
                DELETE FROM pkgpath
                WHERE cat = ? AND name NOT IN pkgname_tmp
            ''', (cat,))

            # Schedule any new packages.
            self._sql('''
                INSERT INTO pkgpath_new (cat, name)
                SELECT :cat, t.name FROM pkgname_tmp AS t
                WHERE NOT (SELECT COUNT(*) FROM pkgpath
                    WHERE cat = :cat AND name = t.name)
            ''', {'cat': cat})

            # Delete the temporary table.
            self._sql('DROP TABLE pkgname_tmp')

            # Done.  Remove from schedule now that we have fully
            # indexed a snapshot of the list of packages in this
            # category.
            self._sql('DELETE FROM cat_new WHERE name = ?', (cat,))
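
    # Package-level reindexing is delete-and-reinsert: _index_pkg drops
    # the pkgpath row and lets ON DELETE CASCADE sweep away dependent
    # pkgbuild, dependency, and conflict rows, then rebuilds everything
    # inside one transaction, so readers never observe a half-indexed
    # package.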

    def _index_pkg(self, catpkg):
        cat, pkg = catpkg
        pkgpath = '%s/%s' % (cat, pkg)
        self._msg('index package %s', pkgpath)

        # Get the package summary and the list of makefiles that
        # contributed to it.
        summary, pbulk_index, srcs = self._summarize_pkg(pkgpath)

        # Index the package summary in a single transaction.
        with self._transaction():
            # Forget everything about this package, cascading foreign
            # key deletions.
            self._sql('''
                DELETE FROM pkgpath WHERE cat = ? AND name = ?
            ''', (cat, pkg))

            # Insert this package afresh.
            self._sql('''
                INSERT INTO pkgpath (cat, name) VALUES (?, ?)
            ''', (cat, pkg))
            pkgpath_id = self._db.last_insert_rowid()

            # Insert this package's sources.  Some sources may be
            # referred to by different pathnames, so INSERT OR IGNORE
            # in case there are duplicates.
            for src in srcs:
                src_id = self._src_id(src)
                self._sql('''
                    INSERT OR IGNORE INTO pkg_src
                        (pkgpath_id, src_id, mtime)
                    VALUES (:pkgpath_id, :src_id, os_mtime(:src))
                ''', {'pkgpath_id': pkgpath_id, 'src_id': src_id,
                      'src': src})

            # Process the summary.
            self._process_summary(pkgpath, pkgpath_id, summary)

            # Process the pbulk index.
            self._process_pbulk_index(pkgpath, pkgpath_id, pbulk_index)
            self._dbg(DBG_SUMMARY, 'package %s done', pkgpath)

            # All done.  Remove from schedule.
            self._sql(
                'DELETE FROM pkgpath_new WHERE cat = ? AND name = ?',
                (cat, pkg))

    def _get_descr(self, catpkg):
        cat, pkg = catpkg
        pkgpath = '%s/%s' % (cat, pkg)
        self._msg('describe package %s', pkgpath)

        # Get the DESCR pathname and mtime.
        cursor = self._sql('''
            SELECT src.id, src.path, os_mtime(src.path)
            FROM pkgpath, src
            WHERE pkgpath.cat = ? AND pkgpath.name = ?
            AND pkgpath.descr_src_id = src.id
        ''', (cat, pkg))
        try:
            src_id, descr_path, mtime = next(cursor)
        except StopIteration:
            raise Exception('failed to get DESCR path and mtime')
        try:
            row1 = next(cursor)
        except StopIteration:
            pass
        else:
            raise Exception('excess DESCR path and mtime: %r' % (row1,))

        # Read the DESCR file.
        with io.open(descr_path, mode='r', encoding='utf8') as f:
            descr = f.read()

        # Update it and remember the mtime we had earlier.  If the
        # DESCR file changed in the interim, this has no effect.
        with self._updates(1):
            self._sql('''
                UPDATE pkgpath SET descr = ?, descr_mtime = ?
                WHERE cat = ? AND name = ? AND descr_src_id = ?
            ''', (descr, mtime, cat, pkg, src_id))

    def _src_id(self, src_path):
        # Canonicalize the source pathname relative to $PKGSRCDIR.
        src_rel = os.path.relpath(src_path, self._pkgsrcdir)
        if src_rel.startswith(os.path.join(os.pardir, '')):
            if src_path.startswith(os.path.join(os.pardir, '')):
                raise Exception('relative path outside $PKGSRCDIR: %r'
                    % (src_path,))
            src_rel = src_path

        # Check for an existing source id.
        src_id = self._sql1(
            'SELECT id FROM src WHERE path = ?', (src_rel,), nullok=True)
        if src_id is None:
            # None found.  Assign one.
            try:
                self._sql('INSERT INTO src (path) VALUES (?)', (src_rel,))
            except apsw.ConstraintError:
                self._err('no such source file: %r', src_rel)
                raise
            src_id = self._db.last_insert_rowid()
        self._dbg(DBG_SRCALIAS, 'src %r %r %r', src_id, src_rel, src_path)
        return src_id

    def _list_subdirs(self, subdir=None):
        # Determine a working directory for make, relative to
        # $PKGSRCDIR.
        wd = self._pkgsrcdir
        if subdir is not None:
            wd = os.path.join(wd, subdir)

        # Can't use show-var because it descends into subdirectories.
        cmd = ['-V', '.MAKE.MAKEFILES', '-V', '${SUBDIR}']
        with self._bmake_output(cmd, wd) as out:
            try:
                mline = next(out).decode('utf8')
            except StopIteration:
                raise Exception('bmake failure for: -V .MAKE.MAKEFILES')
            try:
                sline = next(out).decode('utf8')
            except StopIteration:
                raise Exception("bmake failure for: -V '${SUBDIR}'")
        mline = mline.rstrip('\n')
        srcs = [os.path.join(wd, m) for m in mline.split(' ') if m]
        sline = sline.rstrip('\n')
        subdirs = [s for s in sline.split(' ') if s]
        return subdirs, srcs
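
    # The single bmake run in _summarize_pkg below emits, in order: one
    # .MAKE.MAKEFILES line, the print-summary-data lines of the form
    # `<cmd> SP <pkgpath> [SP <args>]', and the pbulk-index output of
    # the form `<key>=<value>'.  Illustrative (not verbatim) fragments:
    #
    #       comment audio/flac Free lossless audio codec
    #       PKGNAME=flac-1.4.3
    #       ALL_DEPENDS=libogg>=1.1:../../multimedia/libogg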

    def _summarize_pkg(self, pkgpath):
        def parse_summary(line):
            # <cmd> SP <pkgpath> LF?
            # <cmd> SP <pkgpath> SP <args> LF?
            s0 = 0
            e0 = line.find(' ', s0)
            if e0 == -1:
                self._err('%s: invalid summary line: %r', pkgpath, line)
                return '*error*', '*error*', ['*error*']
            s1 = e0 + 1
            e1 = line.find(' ', s1)
            if e1 == -1:
                e1 = len(line)
                if line.endswith('\n'):
                    e1 -= 1
                return line[s0:e0], line[s1:e1], ''
            s2 = e1 + 1
            e2 = len(line)
            if line.endswith('\n'):
                e2 -= 1
            return line[s0:e0], line[s1:e1], line[s2:e2]

        def parse_pbulk(line):
            # <key>`='<value> LF?
            s0 = 0
            e0 = line.find('=', s0)
            if e0 == -1:
                self._err('%s: invalid pbulk line: %r', pkgpath, line)
                return '*error*', '*error*'
            s1 = e0 + 1
            e1 = len(line)
            if line.endswith('\n'):
                e1 -= 1
            return line[s0:e0], line[s1:e1]

        wd = os.path.join(self._pkgsrcdir, pkgpath)
        cmd = [
            'show-var', 'VARNAME=.MAKE.MAKEFILES',
            'print-summary-data',
            'pbulk-index',
            '_MAKEVARS_MK=1', # Avoid depending on $WRKDIR/.*_makevars.mk.
        ]
        with self._bmake_output(cmd, wd) as out:
            # Parse the list of makefiles used directly for
            # print-summary-data and pbulk-index.  Note that
            # pbulk-index may recursively call make with different
            # options, so these are not necessarily the only sources.
            try:
                line = next(out).decode('utf8')
            except StopIteration:
                raise Exception('bmake failed to print .MAKE.MAKEFILES')
            line = line.rstrip('\n')
            srcs = [os.path.join(pkgpath, m) for m in line.split(' ')]

            # Parse the summary, up to the first PKGNAME= line.
            summary = []
            for line in (line.decode('utf8') for line in out):
                # Are we at the pbulk-index?
                if line.startswith('PKGNAME='):
                    break
                # Nope, get a summary line.
                summary.append(parse_summary(line))
            else:
                raise Exception('bmake failed to show pbulk-index')

            # Parse the pbulk index.
            assert line.startswith('PKGNAME=')
            key, val = parse_pbulk(line)
            assert key == 'PKGNAME'
            pbulk_index = []
            scan_depends = []
            while True:
                pkgname = val
                pkgbuild = []
                for line in (line.decode('utf8') for line in out):
                    key, val = parse_pbulk(line)
                    if key == 'PKGNAME':
                        pbulk_index.append((pkgname, pkgbuild))
                        pkgname = val
                        pkgbuild = []
                        break
                    elif key == 'SCAN_DEPENDS':
                        scan_depends += \
                            [os.path.join(pkgpath, p) for p in val.split()]
                    else:
                        pkgbuild.append((key, val))
                else:
                    # We come here if we have exhausted the input.
                    # Gather the last entry and break out of the while
                    # loop.
                    pbulk_index.append((pkgname, pkgbuild))
                    break

        return summary, pbulk_index, srcs + scan_depends

    def _process_summary(self, pkgpath, pkgpath_id, summary):
        for cmd, pkgpath_, args in summary:
            assert pkgpath_ == pkgpath, '%r =/= %r' % (pkgpath_, pkgpath)
            if cmd in self._SUMMARYCMDTAB:
                self._dbg(DBG_SUMMARY, 'package %s summary %s args %r',
                    pkgpath, cmd, args)
                self._SUMMARYCMDTAB[cmd](self, pkgpath, pkgpath_id, args)
            else:
                self._dbg(DBG_SUMMARY, 'package %s ignore %s args %r',
                    pkgpath, cmd, args)

    def _parsedep(self, dep):
        depcolon = dep.find(':')
        if depcolon == -1:
            return None, None, None
        depversion = dep[:depcolon]
        deprelpath = dep[depcolon+1:]
        if not deprelpath.startswith('../../'):
            return None, None, None
        deppkgpath = deprelpath[len('../../'):]
        depslash = deppkgpath.find('/')
        if depslash == -1:
            return None, None, None
        depcat = deppkgpath[:depslash]
        depname = deppkgpath[depslash+1:]
        return depcat, depname, depversion

    def _summarycmd_depends(deptype):
        def cmd(self, pkgpath, pkgpath_id, args):
            for dep in args.split():
                depcat, depname, depversion = self._parsedep(dep)
                if depcat is None:
                    self._err('%s: invalid dependency: %s', pkgpath, dep)
                    continue
                self._sql('''
                    INSERT OR IGNORE INTO pkgpath_depends
                        (pkgpath_id, type, depcat, depname, depversion)
                    VALUES (?, ?, ?, ?, ?)
                ''', (pkgpath_id, deptype, depcat, depname, depversion))
        cmd.__name__ = 'summarycmd_depends(%s)' % (deptype,)
        return cmd

    def _summarycmd_conflicts(self, pkgpath, pkgpath_id, args):
        # XXX Parse the conflicts.
        for pattern in args.split():
            self._sql('''
                INSERT OR IGNORE INTO pkgpath_conflicts
                    (pkgpath_id, pattern)
                VALUES (?, ?)
            ''', (pkgpath_id, pattern))
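
    # The _summarycmd_* helpers that lack a self argument are factories
    # evaluated at class-definition time: each returns the handler that
    # is actually stored in _SUMMARYCMDTAB, so _summarycmd_column(
    # 'homepage'), say, yields a handler that updates pkgpath.homepage
    # for one package.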

    def _summarycmd_column(column):
        def cmd(self, pkgpath, pkgpath_id, args):
            value = args
            with self._updates(1):
                self._sql('UPDATE pkgpath SET %s = ? WHERE id = ?'
                    % (column,), (value, pkgpath_id))
        cmd.__name__ = 'summarycmd_column(%s)' % (column,)
        return cmd

    def _summarycmd_platform(column):
        def cmd(self, pkgpath, pkgpath_id, args):
            platform = args
            if platform != 'any':
                with self._updates(1):
                    self._sql(
                        'UPDATE pkgpath SET %s = ? WHERE id = ?'
                        % (column,), (platform, pkgpath_id))
        cmd.__name__ = 'summarycmd_platform(%s)' % (column,)
        return cmd

    def _summarycmd_descr(self, pkgpath, pkgpath_id, args):
        descr_src = args
        # Note: DESCR_SRC is relative to PKGSRCDIR, not PKGDIR; go
        # figure.
        src_id = self._src_id(descr_src)
        with self._updates(1):
            self._sql('UPDATE pkgpath SET descr_src_id = ? WHERE id = ?',
                (src_id, pkgpath_id))

    _SUMMARYCMDTAB = {
        'depends': _summarycmd_depends('run'),
        'build_depends': _summarycmd_depends('build'),
        'bootstrap_depends': _summarycmd_depends('boot'),
        'tool_depends': _summarycmd_depends('tool'),
        'conflicts': _summarycmd_conflicts,
        # This is the pkgname, which we get through pbulk-index.
        #'index': _summarycmd_column('index'),
        'homepage': _summarycmd_column('homepage'),
        'comment': _summarycmd_column('comment'),
        'license': _summarycmd_column('license'),
        'onlyfor': _summarycmd_platform('onlyfor'),
        'notfor': _summarycmd_platform('notfor'),
        'owner': _summarycmd_column('owner'),
        'maintainer': _summarycmd_column('maintainer'),
        'descr': _summarycmd_descr,
        # XXX Should include this only if it is distinct.
        'prefix': _summarycmd_column('prefix'),
    }

    def _process_pbulk_index(self, pkgpath, pkgpath_id, pbulk_index):
        for pkgname, pkgbuild in pbulk_index:
            dash = pkgname.rfind('-')
            if dash == -1:
                self._err('%s: invalid pkgname: %s', pkgpath, pkgname)
                pkgbase = pkgname
                version = '-1'
            else:
                pkgbase = pkgname[:dash]
                version = pkgname[dash+1:]
            self._sql('''
                INSERT INTO pkgbuild (pkgbase, version, pkgpath_id)
                VALUES (?, ?, ?)
            ''', (pkgbase, version, pkgpath_id))
            pkgbuild_id = self._db.last_insert_rowid()
            for key, val in pkgbuild:
                if key in self._PBULKCMDTAB:
                    self._dbg(DBG_SUMMARY, 'pkgbuild %s pbulk-index %s=%s',
                        pkgname, key, val)
                    self._PBULKCMDTAB[key](self, pkgname, pkgbuild_id, val)
                else:
                    self._dbg(DBG_SUMMARY, 'pkgbuild %s ignore %s=%s',
                        pkgname, key, val)

    def _pbulkcmd_depends(deptype):
        assert deptype is None
        def cmd(self, pkgname, pkgbuild_id, val):
            for dep in val.split():
                depcat, depname, depversion = self._parsedep(dep)
                if depcat is None:
                    self._err('%s: invalid dependency: %s', pkgname, dep)
                    continue
                self._sql('''
                    INSERT OR IGNORE INTO pkgbuild_depends
                        (pkgbuild_id, depcat, depname, depversion)
                    VALUES (?, ?, ?, ?)
                ''', (pkgbuild_id, depcat, depname, depversion))
        cmd.__name__ = 'pbulkcmd_depends(%s)' % (deptype,)
        return cmd

    def _pbulkcmd_column(column, strip=None):
        def cmd(self, pkgname, pkgbuild_id, val):
            with self._updates(1):
                self._sql(
                    'UPDATE pkgbuild SET %s = ? WHERE id = ?' % (column,),
                    (val.strip() if strip else val, pkgbuild_id))
        cmd.__name__ = 'pbulkcmd_column(%s)' % (column,)
        return cmd

    def _pbulkcmd_categories(self, pkgname, pkgbuild_id, val):
        pass
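
    # Note that _pbulkcmd_categories above records nothing: the
    # pkgbuild_cat table in SCHEMA_1 appears intended to hold
    # CATEGORIES, but nothing populates it yet.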

    def _pbulkcmd_bootstrap_pkg(self, pkgname, pkgbuild_id, val):
        with self._updates(1):
            self._sql('UPDATE pkgbuild SET bootstrap_pkg = ? WHERE id = ?',
                (val == 'yes', pkgbuild_id))

    _PBULKCMDTAB = {
        'ALL_DEPENDS': _pbulkcmd_depends(None),
        'PKG_SKIP_REASON': _pbulkcmd_column('pkg_skip_reason'),
        'PKG_FAIL_REASON': _pbulkcmd_column('pkg_fail_reason'),
        'NO_BIN_ON_FTP': _pbulkcmd_column('no_bin_on_ftp'),
        'RESTRICTED': _pbulkcmd_column('restricted'),
        'CATEGORIES': _pbulkcmd_categories,
        'MAINTAINER': _pbulkcmd_column('maintainer'),
        'USE_DESTDIR': _pbulkcmd_column('use_destdir'),
        'BOOTSTRAP_PKG': _pbulkcmd_bootstrap_pkg,
        'USERGROUP_PHASE': _pbulkcmd_column('usergroup_phase'),
        'MULTI_VERSION': _pbulkcmd_column('multi_version', strip=True),
    }

    @contextlib.contextmanager
    def _bmake_output(self, args, wd):
        # Open /dev/null for stdin.
        with open_fd(os.devnull, os.O_RDONLY) as devnull_read:
            popen = subprocess.Popen(
                [self._prog_bmake] + args,
                stdin=devnull_read,
                stdout=subprocess.PIPE,
                stderr=self._errout,
                close_fds=True,
                cwd=wd)
            try:
                yield popen.stdout
            except:
                popen.kill()
                raise
            finally:
                status = popen.wait()
                if status:
                    self._err('bmake failed with status %d', status)
                    # XXX skip dumping a python stack trace
                    raise Exception('bmake')

    def _dbg(self, flags, message, *args):
        if self._dbgout and self._dbgflags & flags:
            self._dbgout.write((message + '\n') % args)

    def _msg(self, message, *args):
        if self._msgout:
            self._msgout.write((message + '\n') % args)

    def _err(self, message, *args):
        self._errout.write((message + '\n') % args)

    @contextlib.contextmanager
    def _transaction(self):
        with self._lock:
            with transaction(self._db):
                yield

    def _sql(self, q, *p):
        cursor = self._db.cursor()
        if p:
            self._dbg(DBG_SQL, 'execute %r %r', q, p[0])
        else:
            self._dbg(DBG_SQL, 'execute %r', q)
        cursor.execute(q, *p)
        if self._db.changes():
            self._dbg(DBG_SQL, 'changed %d row%s', self._db.changes(),
                '' if self._db.changes() == 1 else 's')
        return cursor

    def _sql1(self, q, *p, **k):
        return fetchvalue(self._sql(q, *p), **k)

    @contextlib.contextmanager
    def _updates(self, n):
        before = self._db.totalchanges()
        yield
        after = self._db.totalchanges()
        assert n == after - before
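
# The command-line driver below is the usual entry point, but the class
# can also be driven programmatically; a hypothetical sketch, mirroring
# what the driver does:
#
#       os.chdir('/usr/pkgsrc')
#       indexer = Indexer(dbpath='/tmp/pkg.index',
#           pkgsrcdir='/usr/pkgsrc', dbgout=None, dbgflags=0,
#           msgout=sys.stdout, errout=sys.stderr, prog_bmake='bmake',
#           njobs=4, dry_run=False)
#       indexer.db_schema()
#       indexer.db_check()
#       indexer.index()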

if __name__ == '__main__':
    import getopt
    import signal

    def interrupt(signum, _frame):
        sys.exit(-signum)
    signal.signal(signal.SIGINT, interrupt)
    signal.signal(signal.SIGTERM, interrupt)

    progname = sys.argv[0]
    progname_slash = progname.rfind('/')
    if progname_slash != -1:
        progname = progname[progname_slash+1:]

    def usage(out):
        out.write(
            'Usage: %s [-P <pkgsrcdir>] [-d <dbgflags>] [-j <njobs>]'
            ' [-m <make>] <pkg.index>\n' % (progname,))

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'P:d:j:hm:nq', ['help'])
    except getopt.GetoptError as e:
        sys.stderr.write('%s\n' % (str(e),))
        usage(sys.stderr)
        sys.exit(1)

    arg_debug = 0
    arg_index = None
    arg_make = None
    arg_njobs = None
    arg_pkgsrcdir = None
    arg_quiet = None
    arg_dry = None
    errors = []
    for o, a in opts:
        if o == '-P':
            if arg_pkgsrcdir is not None:
                errors.append('duplicate -P\n')
                continue
            arg_pkgsrcdir = a
        elif o == '-d':
            for flag in a:
                if flag in DBGFLAGS:
                    arg_debug |= DBGFLAGS[flag]
                else:
                    errors.append('unknown -d flag: %s\n' % (flag,))
        elif o in ('-h', '--help'):
            usage(sys.stdout)
            sys.exit(0)
        elif o == '-j':
            if arg_njobs is not None:
                errors.append('duplicate -j\n')
                continue
            try:
                arg_njobs = int(a)
            except ValueError:
                errors.append('invalid -j njobs\n')
                continue
        elif o == '-m':
            if arg_make is not None:
                errors.append('duplicate -m\n')
                continue
            arg_make = a
        elif o == '-n':
            arg_dry = True
        elif o == '-q':
            arg_quiet = True
        else:
            assert False, 'invalid option'
    if len(args) < 1:
        errors.append('missing argument for path to index file\n')
    elif 1 < len(args):
        errors.append('excess arguments\n')
    else:
        arg_index = os.path.abspath(args[0])
    if errors:
        for error in errors:
            sys.stderr.write(error)
        usage(sys.stderr)
        sys.exit(1)

    assert arg_index is not None
    if arg_make is None:
        arg_make = 'bmake'
    if arg_njobs is None:
        arg_njobs = 1
    if arg_pkgsrcdir is None:
        arg_pkgsrcdir = os.getcwd()
    if arg_quiet is None:
        arg_quiet = False

    try:
        os.chdir(arg_pkgsrcdir)
        indexer = Indexer(
            dbpath=arg_index,
            pkgsrcdir=arg_pkgsrcdir,
            dbgout=sys.stderr if arg_debug else None,
            dbgflags=arg_debug,
            msgout=None if arg_quiet else sys.stdout,
            errout=sys.stderr,
            prog_bmake=arg_make,
            njobs=arg_njobs,
            dry_run=arg_dry)
        indexer.db_schema()
        indexer.db_check()
        indexer.index()
        indexer.db_check()
        for (pkgpath, deptype, deppath) in indexer.orphan_dependencies():
            sys.stderr.write(
                'package %s %s-depends on unknown package: %s\n'
                % (pkgpath, deptype, deppath))
        for (name, cats) in indexer.duplicate_pkgpaths():
            sys.stderr.write(
                'package name %s duplicated in categories: %s\n'
                % (name, ' '.join(cats)))
    except KeyboardInterrupt:
        print(traceback.format_exc())