github.com/anishathalye/periscope@v0.3.5/internal/db/db.go (about)

     1  package db
     2  
     3  import (
     4  	"github.com/anishathalye/periscope/internal/herror"
     5  
     6  	"bytes"
     7  	"database/sql"
     8  	"errors"
     9  	"fmt"
    10  	"log"
    11  	"math"
    12  	"path/filepath"
    13  	"sort"
    14  	"strconv"
    15  	"sync/atomic"
    16  
    17  	_ "github.com/mattn/go-sqlite3"
    18  )
    19  
    20  const versionKey = "version"
    21  const version = 3
    22  
    23  type FileInfo struct {
    24  	Path      string
    25  	Size      int64
    26  	ShortHash []byte
    27  	FullHash  []byte
    28  }
    29  
    30  type DuplicateSet []FileInfo
    31  
    32  type fileInfosOrdering []FileInfo
    33  
    34  func (a fileInfosOrdering) Len() int { return len(a) }
    35  func (a fileInfosOrdering) Less(i, j int) bool {
    36  	if a[i].Size > a[j].Size {
    37  		return true
    38  	} else if a[i].Size < a[j].Size {
    39  		return false
    40  	}
    41  	return a[i].Path < a[j].Path
    42  }
    43  func (a fileInfosOrdering) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
    44  
    45  type DuplicateInfo struct {
    46  	Path     string
    47  	FullHash []byte
    48  	Count    int64
    49  }
    50  
    51  type duplicateInfoByPath []DuplicateInfo
    52  
    53  func (a duplicateInfoByPath) Len() int           { return len(a) }
    54  func (a duplicateInfoByPath) Less(i, j int) bool { return a[i].Path < a[j].Path }
    55  func (a duplicateInfoByPath) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
    56  
// InfoSummary holds aggregate statistics over the file_info table, as
// computed by Session.Summary.
type InfoSummary struct {
	Files     int64 // total number of rows in file_info
	Unique    int64 // Files minus Duplicate
	Duplicate int64 // files in duplicate sets, not counting one file per set
	Overhead  int64 // sum over duplicate sets of (set size - 1) * file size
}
    63  
// Session is a database session, or a transaction.
//
// This is a sort of weird implementation, but it makes the
// interface/implementation convenient. The same type exposes a bunch of
// methods that operate on the database, and the object and methods are the
// same regardless of whether the operations are done within a transaction.
//
// Calling Begin() returns a Session that is a transaction, and calling
// Commit() on the resultant session (transaction) commits the transaction.
//
// The db field is non-nil for a new session. For an open transaction, db is
// nil and tx is non-nil. Once Commit() is called on the transaction, both the
// db and tx are nil (and any method calls on this object will fail).
type Session struct {
	db *sql.DB // set for a plain (non-transaction) session
	tx *sql.Tx // set while this session is an open transaction
}
    81  
    82  var inMemoryDbCtr int64 = 0
    83  
    84  func NewInMemory() (*Session, herror.Interface) {
    85  	// https://www.sqlite.org/inmemorydb.html#sharedmemdb
    86  	//
    87  	// We need distinct in-memory databases (for separate tests),
    88  	// but each in-memory database should support multiple connections
    89  	ctr := atomic.LoadInt64(&inMemoryDbCtr)
    90  	atomic.StoreInt64(&inMemoryDbCtr, ctr+1)
    91  	return New(fmt.Sprintf("file:memdb%d?mode=memory&cache=shared", ctr), true)
    92  }
    93  
    94  func New(dataSourceName string, debug bool) (*Session, herror.Interface) {
    95  	db, err := sql.Open("sqlite3", dataSourceName)
    96  	if err != nil {
    97  		return nil, herror.Internal(err, "")
    98  	}
    99  	// execute dummy statement to catch problems with db access
   100  	_, err = db.Exec("")
   101  	if err != nil {
   102  		return nil, herror.Unlikely(err, fmt.Sprintf("unable to access database at '%s'", dataSourceName), `
   103  Ensure that the directory is writable, and if the database file already exists, ensure it is readable and writable.
   104  		`)
   105  	}
   106  	// set up pragmas
   107  	if debug {
   108  		// good sanity check but slows things down, especially the gc in RemoveDir()
   109  		_, err = db.Exec("PRAGMA foreign_keys = ON")
   110  	} else {
   111  		_, err = db.Exec("PRAGMA foreign_keys = OFF")
   112  	}
   113  	if err != nil {
   114  		return nil, herror.Internal(err, "")
   115  	}
   116  	_, err = db.Exec("PRAGMA cache_size = -500000") // 500 MB
   117  	if err != nil {
   118  		return nil, herror.Internal(err, "")
   119  	}
   120  
   121  	s := &Session{db: db}
   122  	herr := s.checkVersion()
   123  	if herr != nil {
   124  		return nil, herr
   125  	}
   126  	err = s.initSchema()
   127  	if err != nil {
   128  		return nil, herror.Internal(err, "")
   129  	}
   130  	return s, nil
   131  }
   132  
   133  func (s *Session) checkVersion() herror.Interface {
   134  	// ensure metadata table exists
   135  	_, err := s.db.Exec(`
   136  	CREATE TABLE IF NOT EXISTS meta
   137  	(
   138  		key   TEXT UNIQUE NOT NULL,
   139  		value BLOB NOT NULL
   140  	)
   141  	`)
   142  	if err != nil {
   143  		return herror.Internal(err, "")
   144  	}
   145  	row := s.db.QueryRow("SELECT value FROM meta WHERE key = ?", versionKey)
   146  	var dbVersion string
   147  	err = row.Scan(&dbVersion)
   148  	if err == sql.ErrNoRows {
   149  		// okay, we will initialize version
   150  		_, err = s.db.Exec("INSERT INTO meta (key, value) VALUES (?, ?)", versionKey, strconv.Itoa(version))
   151  		if err != nil {
   152  			return herror.Internal(err, "")
   153  		}
   154  		return nil
   155  	}
   156  	// DB has a version, make sure it's the current version
   157  	dbVersionInt, err := strconv.ParseInt(dbVersion, 10, 0)
   158  	if err != nil || dbVersionInt != version {
   159  		return herror.Unlikely(nil, fmt.Sprintf("database version mismatch: expected %d, got %s", version, dbVersion), `
   160  This database was likely produced by an incompatible version of Periscope. Either use a compatible version of Periscope, or delete the database (by running 'psc finish') and try again.
   161  		`)
   162  	}
   163  	// correct version
   164  	return nil
   165  }
   166  
   167  func (s *Session) initSchema() error {
   168  	// only called in New, so db is non-null
   169  	_, err := s.db.Exec(`
   170  	CREATE TABLE IF NOT EXISTS directory
   171  	(
   172  		id     INTEGER PRIMARY KEY NOT NULL,
   173  		name   TEXT NOT NULL,
   174  		parent INTEGER NULL,
   175  		FOREIGN KEY(parent) REFERENCES directory(id),
   176  		UNIQUE(name, parent)
   177  	)
   178  	`)
   179  	if err != nil {
   180  		return err
   181  	}
   182  	_, err = s.db.Exec(`
   183  	CREATE TABLE IF NOT EXISTS file_info
   184  	(
   185  		id         INTEGER PRIMARY KEY NOT NULL,
   186  		directory  INTEGER NOT NULL,
   187  		filename   TEXT NOT NULL,
   188  		size       INTEGER NOT NULL,
   189  		short_hash BLOB NULL,
   190  		full_hash  BLOB NULL,
   191  		FOREIGN KEY(directory) REFERENCES directory(id),
   192  		UNIQUE(directory, filename)
   193  	)
   194  	`)
   195  	return err
   196  }
   197  
   198  func (s *Session) Begin() (*Session, herror.Interface) {
   199  	if s.tx != nil {
   200  		return nil, herror.Internal(nil, "cannot Begin(): already in a transaction")
   201  	}
   202  	if s.db == nil {
   203  		return nil, herror.Internal(nil, "cannot Begin(): finished transaction")
   204  	}
   205  	tx, err := s.db.Begin()
   206  	if err != nil {
   207  		return nil, herror.Internal(err, "")
   208  	}
   209  	return &Session{db: nil, tx: tx}, nil
   210  }
   211  
   212  func (s *Session) Commit() herror.Interface {
   213  	if s.tx == nil {
   214  		return herror.Internal(nil, "Commit(): not in a running transaction")
   215  	}
   216  	err := s.tx.Commit()
   217  	if err != nil {
   218  		return herror.Internal(err, "")
   219  	}
   220  	s.tx = nil
   221  	return nil
   222  }
   223  
   224  func (s *Session) query(query string, args ...interface{}) (*sql.Rows, error) {
   225  	if s.tx != nil {
   226  		return s.tx.Query(query, args...)
   227  	}
   228  	if s.db == nil {
   229  		return nil, herror.Internal(nil, "transaction is finished")
   230  	}
   231  	return s.db.Query(query, args...)
   232  }
   233  
   234  func (s *Session) queryRow(query string, args ...interface{}) (*sql.Row, herror.Interface) {
   235  	if s.tx != nil {
   236  		return s.tx.QueryRow(query, args...), nil
   237  	}
   238  	if s.db == nil {
   239  		return nil, herror.Internal(nil, "transaction is finished")
   240  	}
   241  	return s.db.QueryRow(query, args...), nil
   242  }
   243  
   244  func (s *Session) exec(query string, args ...interface{}) (sql.Result, error) {
   245  	if s.tx != nil {
   246  		return s.tx.Exec(query, args...)
   247  	}
   248  	if s.db == nil {
   249  		return nil, herror.Internal(nil, "transaction is finished")
   250  	}
   251  	return s.db.Exec(query, args...)
   252  }
   253  
   254  func (s *Session) pathToDirectoryId(path string, create bool) (int64, error) {
   255  	if path == "" {
   256  		return 0, errors.New("path is empty")
   257  	}
   258  	path = filepath.Clean(path) // remove extra "/" at the end, etc.
   259  	var elems []string
   260  	var base string
   261  	for base != "/" {
   262  		base = filepath.Base(path)
   263  		elems = append(elems, base)
   264  		path = filepath.Dir(path)
   265  	}
   266  	id := int64(-1)
   267  	for i := len(elems) - 1; i >= 0; i-- {
   268  		var row *sql.Row
   269  		var err error
   270  		if id == -1 {
   271  			row, err = s.queryRow(`
   272  			SELECT id
   273  			FROM directory
   274  			WHERE name = ?
   275  				AND parent IS NULL
   276  			`, elems[i])
   277  		} else {
   278  			row, err = s.queryRow(`
   279  			SELECT id
   280  			FROM directory
   281  			WHERE name = ?
   282  				AND parent = ?
   283  			`, elems[i], id)
   284  		}
   285  		if err != nil {
   286  			return 0, err
   287  		}
   288  		err = row.Scan(&id)
   289  		if err == sql.ErrNoRows && create {
   290  			// need to create it
   291  			var result sql.Result
   292  			if id == -1 {
   293  				result, err = s.exec(`
   294  				INSERT INTO directory (name, parent) VALUES (?, NULL)
   295  				`, elems[i])
   296  			} else {
   297  				result, err = s.exec(`
   298  				INSERT INTO directory (name, parent) VALUES (?, ?)
   299  				`, elems[i], id)
   300  			}
   301  			if err != nil {
   302  				return 0, err
   303  			}
   304  			id, err = result.LastInsertId()
   305  			if err != nil {
   306  				return 0, err
   307  			}
   308  		} else if err != nil {
   309  			return 0, err
   310  		}
   311  	}
   312  	return id, nil
   313  }
   314  
   315  func (s *Session) directoryIdToPath(id int64) (string, error) {
   316  	rows, err := s.query(`
   317  	WITH RECURSIVE sup_directory (id, name, parent, level) AS (
   318  		SELECT id, name, parent, 1 FROM directory WHERE id = ?
   319  		UNION ALL
   320  		SELECT d.id, d.name, d.parent, level+1
   321  		FROM directory d, sup_directory sd
   322  		WHERE d.id = sd.parent
   323  	)
   324  	SELECT name, (SELECT max(level) FROM sup_directory) - level AS distance
   325  	FROM sup_directory
   326  	ORDER BY distance
   327  	`, id)
   328  	if err != nil {
   329  		return "", err
   330  	}
   331  	defer rows.Close()
   332  	var path string
   333  	for rows.Next() {
   334  		var name string
   335  		var level int64
   336  		if err = rows.Scan(&name, &level); err != nil {
   337  			return "", err
   338  		}
   339  		if path == "" {
   340  			path = name
   341  		} else {
   342  			path = filepath.Join(path, name)
   343  		}
   344  	}
   345  	return path, nil
   346  }
   347  
   348  func (s *Session) Add(info FileInfo) herror.Interface {
   349  	dirname := filepath.Dir(info.Path)
   350  	filename := filepath.Base(info.Path)
   351  	dirid, err := s.pathToDirectoryId(dirname, true)
   352  	if err != nil {
   353  		return herror.Internal(err, "")
   354  	}
   355  	if _, err := s.exec(`
   356  	REPLACE INTO file_info (directory, filename, size, short_hash, full_hash)
   357  	VALUES (?, ?, ?, ?, ?)
   358  	`, dirid, filename, info.Size, info.ShortHash, info.FullHash); err != nil {
   359  		return herror.Internal(err, "")
   360  	}
   361  	return nil
   362  }
   363  
   364  // Returns all infos in the database (regardless of whether they have
   365  // duplicates).
   366  func (s *Session) AllInfosC() (<-chan FileInfo, herror.Interface) {
   367  	rows, err := s.query(`
   368  	SELECT directory, filename, size, short_hash, full_hash
   369  	FROM file_info`)
   370  	if err != nil {
   371  		return nil, herror.Internal(err, "")
   372  	}
   373  	results := make(chan FileInfo)
   374  	go func() {
   375  		defer rows.Close()
   376  		for rows.Next() {
   377  			var dirid int64
   378  			var filename string
   379  			var info FileInfo
   380  			if err := rows.Scan(&dirid, &filename, &info.Size, &info.ShortHash, &info.FullHash); err != nil {
   381  				// similar issue as below in AllDuplicatesC: how to report this?
   382  				log.Printf("failure while scanning row: %s", err)
   383  				continue
   384  			}
   385  			dirname, err := s.directoryIdToPath(dirid)
   386  			if err != nil {
   387  				log.Printf("failure while resolving directory name: %s", err)
   388  				continue
   389  			}
   390  			info.Path = filepath.Join(dirname, filename)
   391  			results <- info
   392  		}
   393  		close(results)
   394  	}()
   395  	return results, nil
   396  }
   397  
   398  func (s *Session) AllInfos() ([]FileInfo, herror.Interface) {
   399  	var r []FileInfo
   400  	c, err := s.AllInfosC()
   401  	if err != nil {
   402  		return nil, err
   403  	}
   404  	for i := range c {
   405  		r = append(r, i)
   406  	}
   407  	sort.Sort(fileInfosOrdering(r))
   408  	return r, nil
   409  }
   410  
   411  func (s *Session) CreateIndexes() herror.Interface {
   412  	// ensuring that an index on full_hash exists makes a huge difference
   413  	// in performance for commands like ls, because we use this for finding
   414  	// duplicates
   415  	_, err := s.exec("CREATE INDEX IF NOT EXISTS idx_hash ON file_info (full_hash)")
   416  	if err != nil {
   417  		return herror.Internal(err, "")
   418  	}
   419  	// makes a big difference when we are looking up by size (relevant when
   420  	// scanning)
   421  	_, err = s.exec("CREATE INDEX IF NOT EXISTS idx_size ON file_info (size)")
   422  	if err != nil {
   423  		return herror.Internal(err, "")
   424  	}
   425  	// for looking up files by directory/filename
   426  	_, err = s.exec("CREATE INDEX IF NOT EXISTS idx_directory_filename ON file_info (directory, filename)")
   427  	if err != nil {
   428  		return herror.Internal(err, "")
   429  	}
   430  	// for recursive lookup
   431  	_, err = s.exec("CREATE INDEX IF NOT EXISTS idx_name_parent ON directory (name, parent)")
   432  	if err != nil {
   433  		return herror.Internal(err, "")
   434  	}
   435  	// indexes on foreign keys
   436  	_, err = s.exec("CREATE INDEX IF NOT EXISTS idx_directory ON file_info (directory)")
   437  	if err != nil {
   438  		return herror.Internal(err, "")
   439  	}
   440  	_, err = s.exec("CREATE INDEX IF NOT EXISTS idx_parent ON directory (parent)")
   441  	if err != nil {
   442  		return herror.Internal(err, "")
   443  	}
   444  	return nil
   445  }
   446  
   447  // Returns all known duplicates in the database.
   448  //
   449  // These are necessarily FileInfos with the FullHash field filled out. Each
   450  // DuplicateSet that is returned always has > 1 element (i.e. it only includes
   451  // duplicates, not infos where we happen to know the full hash).
   452  //
   453  // path is optional; if "", then all duplicates are returned, otherwise only
   454  // ones with the given directory prefix
   455  func (s *Session) AllDuplicatesC(path string) (<-chan DuplicateSet, herror.Interface) {
   456  	results := make(chan DuplicateSet)
   457  	dirid := int64(-1)
   458  	var err error
   459  	if path != "" {
   460  		dirid, err = s.pathToDirectoryId(path, false)
   461  		if err == sql.ErrNoRows {
   462  			close(results)
   463  			return results, nil
   464  		} else if err != nil {
   465  			return nil, herror.Internal(err, "")
   466  		}
   467  	}
   468  	var rows *sql.Rows
   469  	if dirid == -1 {
   470  		rows, err = s.query(`
   471  		SELECT directory, filename, size, short_hash, full_hash
   472  		FROM file_info
   473  		WHERE full_hash IS NOT NULL
   474  		ORDER BY size DESC, full_hash`)
   475  	} else {
   476  		rows, err = s.query(`
   477  		WITH dirs AS
   478  		(
   479  			WITH RECURSIVE sub_directory (id, parent) AS (
   480  				SELECT id, parent FROM directory WHERE id = ?
   481  				UNION ALL
   482  				SELECT d.id, d.parent
   483  				FROM directory d, sub_directory sd
   484  				WHERE d.parent = sd.id
   485  			)
   486  			SELECT id FROM sub_directory
   487  		),
   488  		matching_hashes AS
   489  		(
   490  			SELECT full_hash FROM file_info WHERE directory IN dirs AND full_hash IS NOT NULL
   491  		)
   492  		SELECT directory, filename, size, short_hash, full_hash
   493  		FROM file_info
   494  		WHERE full_hash IN matching_hashes
   495  		ORDER BY size DESC, full_hash`, dirid)
   496  	}
   497  	if err != nil {
   498  		return nil, herror.Internal(err, "")
   499  	}
   500  	go func() {
   501  		defer rows.Close()
   502  		var set DuplicateSet
   503  		var prevHash []byte
   504  		for rows.Next() {
   505  			var dirid int64
   506  			var filename string
   507  			var info FileInfo
   508  			if err := rows.Scan(&dirid, &filename, &info.Size, &info.ShortHash, &info.FullHash); err != nil {
   509  				// how should we handle this error that happens in its own goroutine?
   510  				// give up on this row?
   511  				log.Printf("failure while scanning row: %s", err)
   512  				continue
   513  			}
   514  			dirname, err := s.directoryIdToPath(dirid)
   515  			if err != nil {
   516  				log.Printf("failure while resolving directory name: %s", err)
   517  				continue
   518  			}
   519  			info.Path = filepath.Join(dirname, filename)
   520  			if !bytes.Equal(info.FullHash, prevHash) {
   521  				if len(set) > 1 {
   522  					// note: set may have singletons, we don't remove info about files with single matches
   523  					sort.Sort(fileInfosOrdering(set))
   524  					results <- set
   525  				}
   526  				set = nil
   527  			}
   528  			prevHash = info.FullHash
   529  			set = append(set, info)
   530  		}
   531  		// will usually be some infos left over, if the last file size/hash has duplicates
   532  		if len(set) > 1 {
   533  			sort.Sort(fileInfosOrdering(set))
   534  			results <- set
   535  		}
   536  		close(results)
   537  	}()
   538  	return results, nil
   539  }
   540  
   541  func (s *Session) AllDuplicates(path string) ([]DuplicateSet, herror.Interface) {
   542  	var r []DuplicateSet
   543  	c, err := s.AllDuplicatesC(path)
   544  	if err != nil {
   545  		return nil, err
   546  	}
   547  	for d := range c {
   548  		r = append(r, d)
   549  	}
   550  	return r, nil
   551  }
   552  
   553  func (s *Session) Summary() (InfoSummary, herror.Interface) {
   554  	row, err := s.queryRow("SELECT COUNT(*) FROM file_info")
   555  	if err != nil {
   556  		return InfoSummary{}, err
   557  	}
   558  	var files int64
   559  	if err := row.Scan(&files); err != nil {
   560  		return InfoSummary{}, herror.Internal(err, "")
   561  	}
   562  	row, err = s.queryRow(`
   563  	WITH sets AS
   564  	(
   565  		SELECT COUNT(*) AS cnt, size
   566  		FROM file_info
   567  		GROUP BY full_hash
   568  		HAVING COUNT(full_hash) > 1
   569  	)
   570  	SELECT COUNT(*), SUM(cnt), SUM((cnt-1)*size) from sets
   571  	`)
   572  	if err != nil {
   573  		return InfoSummary{}, err
   574  	}
   575  	var uniqueWithDuplicates int64
   576  	var filesWithDuplicates, overhead sql.NullInt64
   577  	if err := row.Scan(&uniqueWithDuplicates, &filesWithDuplicates, &overhead); err != nil {
   578  		return InfoSummary{}, herror.Internal(err, "")
   579  	}
   580  	duplicate := filesWithDuplicates.Int64 - uniqueWithDuplicates
   581  	return InfoSummary{
   582  		Files:     files,
   583  		Unique:    files - duplicate,
   584  		Duplicate: duplicate,
   585  		Overhead:  overhead.Int64,
   586  	}, nil
   587  }
   588  
   589  // Returns info for everything matching the given file.
   590  //
   591  // Returns [] if there isn't a matching file in the database. If the file
   592  // exists in the database, that file is returned first.
   593  func (s *Session) Lookup(path string) (DuplicateSet, herror.Interface) {
   594  	dirname := filepath.Dir(path)
   595  	filename := filepath.Base(path)
   596  	var set DuplicateSet
   597  	dirid, err := s.pathToDirectoryId(dirname, false)
   598  	if err == sql.ErrNoRows {
   599  		return set, nil // directory not known => empty
   600  	} else if err != nil {
   601  		return nil, herror.Internal(err, "")
   602  	}
   603  	row, herr := s.queryRow(`
   604  	SELECT id, size, short_hash, full_hash
   605  	FROM file_info
   606  	WHERE directory = ? AND filename = ?
   607  	`, dirid, filename)
   608  	if herr != nil {
   609  		return nil, herr
   610  	}
   611  	var id int
   612  	var info FileInfo
   613  	err = row.Scan(&id, &info.Size, &info.ShortHash, &info.FullHash)
   614  	if err == sql.ErrNoRows {
   615  		return set, nil // empty
   616  	} else if err != nil {
   617  		return nil, herror.Internal(err, "")
   618  	}
   619  	info.Path = filepath.Join(dirname, filename)
   620  	if info.FullHash == nil {
   621  		// no known duplicates
   622  		set = append(set, info)
   623  		return set, nil
   624  	}
   625  	// get all others
   626  	rows, err := s.query(`
   627  	SELECT directory, filename, size, short_hash, full_hash
   628  	FROM file_info
   629  	WHERE full_hash = ? AND id != ?`, info.FullHash, id)
   630  	if err != nil {
   631  		return nil, herror.Internal(err, "")
   632  	}
   633  	defer rows.Close()
   634  	for rows.Next() {
   635  		var info FileInfo
   636  		if err := rows.Scan(&dirid, &filename, &info.Size, &info.ShortHash, &info.FullHash); err != nil {
   637  			return nil, herror.Internal(err, "")
   638  		}
   639  		dirname, err := s.directoryIdToPath(dirid)
   640  		if err != nil {
   641  			return nil, herror.Internal(err, "")
   642  		}
   643  		info.Path = filepath.Join(dirname, filename)
   644  		set = append(set, info)
   645  	}
   646  	sort.Sort(fileInfosOrdering(set))
   647  	set = append(DuplicateSet{info}, set...) // so the given info is first
   648  	return set, nil
   649  }
   650  
   651  // Returns all the infos with the given size.
   652  //
   653  // This includes all infos, even ones where the short hash or full hash is not known.
   654  func (s *Session) InfosBySize(size int64) ([]FileInfo, herror.Interface) {
   655  	rows, err := s.query(`
   656  	SELECT directory, filename, size, short_hash, full_hash
   657  	FROM file_info
   658  	WHERE size = ?
   659  	`, size)
   660  	if err != nil {
   661  		return nil, herror.Internal(err, "")
   662  	}
   663  	defer rows.Close()
   664  	var results []FileInfo
   665  	for rows.Next() {
   666  		var dirid int64
   667  		var filename string
   668  		var info FileInfo
   669  		if err := rows.Scan(&dirid, &filename, &info.Size, &info.ShortHash, &info.FullHash); err != nil {
   670  			return nil, herror.Internal(err, "")
   671  		}
   672  		dirname, err := s.directoryIdToPath(dirid)
   673  		if err != nil {
   674  			return nil, herror.Internal(err, "")
   675  		}
   676  		info.Path = filepath.Join(dirname, filename)
   677  		results = append(results, info)
   678  	}
   679  	return results, nil
   680  }
   681  
   682  // Returns all duplicate sets (size > 1) where at least one file is contained under the given path.
   683  func (s *Session) LookupAllC(path string, includeHidden bool) (<-chan DuplicateInfo, herror.Interface) {
   684  	results := make(chan DuplicateInfo)
   685  	dirid, err := s.pathToDirectoryId(path, false)
   686  	if err == sql.ErrNoRows {
   687  		close(results)
   688  		return results, nil
   689  	} else if err != nil {
   690  		return nil, herror.Internal(err, "")
   691  	}
   692  	var rows *sql.Rows
   693  	if includeHidden {
   694  		rows, err = s.query(`
   695  		WITH dirs AS
   696  		(
   697  			WITH RECURSIVE sub_directory (id, parent) AS (
   698  				SELECT id, parent FROM directory WHERE id = ?
   699  				UNION ALL
   700  				SELECT d.id, d.parent
   701  				FROM directory d, sub_directory sd
   702  				WHERE d.parent = sd.id
   703  			)
   704  			SELECT id FROM sub_directory
   705  		)
   706  		SELECT a.directory, a.filename, a.full_hash, COUNT(b.id)
   707  		FROM file_info a, file_info b
   708  		WHERE a.full_hash IS NOT NULL
   709  			AND a.full_hash = b.full_hash
   710  			AND a.directory IN dirs
   711  		GROUP BY a.directory, a.filename
   712  		`, dirid)
   713  	} else {
   714  		rows, err = s.query(`
   715  		WITH dirs AS
   716  		(
   717  			WITH RECURSIVE sub_directory (id, parent) AS (
   718  				SELECT id, parent FROM directory WHERE id = ?
   719  				UNION ALL
   720  				SELECT d.id, d.parent
   721  				FROM directory d, sub_directory sd
   722  				WHERE d.parent = sd.id
   723  					AND SUBSTR(d.name, 1, 1) != '.'
   724  			)
   725  			SELECT id FROM sub_directory
   726  		)
   727  		SELECT a.directory, a.filename, a.full_hash, COUNT(b.id)
   728  		FROM file_info a, file_info b
   729  		WHERE a.full_hash IS NOT NULL
   730  			AND a.full_hash = b.full_hash
   731  			AND a.directory IN dirs
   732  			AND SUBSTR(a.filename, 1, 1) != '.'
   733  		GROUP BY a.directory, a.filename
   734  		`, dirid)
   735  	}
   736  	if err != nil {
   737  		return nil, herror.Internal(err, "")
   738  	}
   739  	go func() {
   740  		defer rows.Close()
   741  		for rows.Next() {
   742  			var dirid int64
   743  			var filename string
   744  			var fullHash []byte
   745  			var count int64
   746  			if err := rows.Scan(&dirid, &filename, &fullHash, &count); err != nil {
   747  				log.Printf("failure while scanning row: %s", err)
   748  				continue
   749  			}
   750  			dirname, err := s.directoryIdToPath(dirid)
   751  			if err != nil {
   752  				log.Printf("failure while resolving directory name: %s", err)
   753  				continue
   754  			}
   755  			path := filepath.Join(dirname, filename)
   756  			if count > 1 {
   757  				results <- DuplicateInfo{Path: path, FullHash: fullHash, Count: count}
   758  			}
   759  		}
   760  		close(results)
   761  	}()
   762  	return results, nil
   763  }
   764  
   765  func (s *Session) LookupAll(path string, includeHidden bool) ([]DuplicateInfo, herror.Interface) {
   766  	var r []DuplicateInfo
   767  	c, err := s.LookupAllC(path, includeHidden)
   768  	if err != nil {
   769  		return nil, err
   770  	}
   771  	for i := range c {
   772  		r = append(r, i)
   773  	}
   774  	sort.Sort(duplicateInfoByPath(r))
   775  	return r, nil
   776  }
   777  
   778  // Deletes a file with the given path from the database.
   779  func (s *Session) Remove(path string) herror.Interface {
   780  	dirname := filepath.Dir(path)
   781  	filename := filepath.Base(path)
   782  	dirid, err := s.pathToDirectoryId(dirname, true)
   783  	if err == sql.ErrNoRows {
   784  		return nil
   785  	} else if err != nil {
   786  		return herror.Internal(err, "")
   787  	}
   788  	_, err = s.exec(`
   789  	DELETE FROM file_info
   790  	WHERE directory = ? AND filename = ?`, dirid, filename)
   791  	if err != nil {
   792  		return herror.Internal(err, "")
   793  	}
   794  	// don't bother to delete orphaned directories here
   795  	return nil
   796  }
   797  
   798  // Deletes all files matching the given directory prefix from the database,
   799  // with sizes in the specified range.
   800  //
   801  // A max size of 0 is interpreted as infinity. This does not just match based
   802  // on prefix, it interprets the prefix as a directory, and only deletes files
   803  // under the given directory. This means that it won't accidentally match file
   804  // names (or other directory names) where the prefix is common, e.g. deleting
   805  // "/a" won't delete file "/aa" or contents under a directory "/aa".
   806  func (s *Session) RemoveDir(dir string, min, max int64) herror.Interface {
   807  	if max <= 0 {
   808  		max = math.MaxInt64
   809  	}
   810  	dirid, err := s.pathToDirectoryId(dir, false)
   811  	if err == sql.ErrNoRows {
   812  		return nil
   813  	} else if err != nil {
   814  		return herror.Internal(err, "")
   815  	}
   816  	if min == 0 && max == math.MaxInt64 {
   817  		// more efficient query
   818  		_, err = s.exec(`
   819  		WITH dirs AS
   820  		(
   821  			WITH RECURSIVE sub_directory (id, parent) AS (
   822  				SELECT id, parent FROM directory WHERE id = ?
   823  				UNION ALL
   824  				SELECT d.id, d.parent
   825  				FROM directory d, sub_directory sd
   826  				WHERE d.parent = sd.id
   827  			)
   828  			SELECT id FROM sub_directory
   829  		)
   830  		DELETE FROM file_info
   831  		WHERE directory IN dirs
   832  		`, dirid)
   833  	} else {
   834  		_, err = s.exec(`
   835  		WITH dirs AS
   836  		(
   837  			WITH RECURSIVE sub_directory (id, parent) AS (
   838  				SELECT id, parent FROM directory WHERE id = ?
   839  				UNION ALL
   840  				SELECT d.id, d.parent
   841  				FROM directory d, sub_directory sd
   842  				WHERE d.parent = sd.id
   843  			)
   844  			SELECT id FROM sub_directory
   845  		)
   846  		DELETE FROM file_info
   847  		WHERE directory IN dirs
   848  			AND size > ?
   849  			AND size <= ?
   850  		`, dirid, min, max)
   851  	}
   852  	if err != nil {
   853  		return herror.Internal(err, "")
   854  	}
   855  	// delete orphaned directories
   856  	_, err = s.exec(`
   857  	WITH reachable AS
   858  	(
   859  		WITH RECURSIVE sub_directory (id, parent) AS (
   860  			SELECT id, parent FROM directory WHERE id IN (SELECT DISTINCT directory FROM file_info)
   861  			UNION ALL
   862  			SELECT d.id, d.parent
   863  			FROM directory d, sub_directory sd
   864  			WHERE d.id = sd.parent
   865  		)
   866  		SELECT DISTINCT id
   867  		FROM sub_directory
   868  	)
   869  	DELETE FROM directory
   870  	WHERE id NOT IN reachable`)
   871  	if err != nil {
   872  		return herror.Internal(err, "")
   873  	}
   874  	return nil
   875  }