github.com/drellem2/pogo@v0.0.0-20240503070746-2c2b76da329a/internal/plugins/search/search_index.go (about)

     1  package search
     2  
     3  import (
     4  	"context"
     5  	"encoding/json"
     6  	"errors"
     7  	"io/ioutil"
     8  	"os"
     9  	"path/filepath"
    10  	"strconv"
    11  	"strings"
    12  	"time"
    13  
    14  	"github.com/sabhiram/go-gitignore"
    15  	"github.com/sourcegraph/zoekt"
    16  	"github.com/sourcegraph/zoekt/query"
    17  
    18  	pogoPlugin "github.com/drellem2/pogo/pkg/plugin"
    19  )
    20  
    21  const saveFileName = "search_index.json"
    22  const codeSearchIndexFileName = "code_search_index"
    23  const indexStartCapacity = 50
    24  const indexCacheMinutes = 24 * 60
    25  
// PogoChunkMatch describes one matched chunk inside a file returned by Search.
type PogoChunkMatch struct {
	// Line is the line number at which the matched chunk's content starts.
	Line    uint32 `json:"line"`
	// Content is the text of the matched chunk.
	Content string `json:"content"`
}
    30  
// PogoFileMatch groups all chunk matches found in a single file.
type PogoFileMatch struct {
	// Path is the matched file's name with the project root removed.
	Path    string           `json:"path"`
	// Matches holds one entry per matched chunk in the file.
	Matches []PogoChunkMatch `json:"matches"`
}
    35  
// SearchResults is the value returned by Search: one entry per matching file.
type SearchResults struct {
	Files []PogoFileMatch `json:"files"`
}
    39  
// IndexedProject is the list of files known for one project. It is the value
// that is serialized to the project's JSON save file.
type IndexedProject struct {
	// Root is the project root path; it is also the key used in the projects map.
	Root  string   `json:"root"`
	// Paths holds the indexed file paths with the Root prefix trimmed off.
	Paths []string `json:"paths"`
}
    44  
// ProjectUpdater contains channels that can be written to in order to update
// the project. The channels are read by the write loop started in
// newProjectUpdater.
type ProjectUpdater struct {
	// c receives projects that should be stored and serialized.
	c        chan *IndexedProject
	// addFw receives paths to add to the file watcher.
	addFw    chan string
	// removeFw receives paths to remove from the file watcher.
	removeFw chan string
	// quit signals the write loop to stop.
	quit     chan bool
	// closed is set by the write loop once a value has been read from quit.
	closed   bool
}
    57  
    58  func absolute(path string) (string, error) {
    59  	str, err := filepath.Abs(path)
    60  	if err != nil {
    61  		return "", err
    62  	}
    63  	info, err2 := os.Lstat(path)
    64  	if err2 != nil {
    65  		return "", err2
    66  	}
    67  	if info.IsDir() {
    68  		return str + "/", nil
    69  	}
    70  	return str, nil
    71  }
    72  
    73  /*
    74  *
    75  
    76  	Returns some channels that can be written to in order to update the project.
    77  	Starts a goroutine that will read these channels.
    78  */
    79  func (g *BasicSearch) newProjectUpdater() *ProjectUpdater {
    80  	u := &ProjectUpdater{
    81  		c:        make(chan *IndexedProject),
    82  		addFw:    make(chan string),
    83  		removeFw: make(chan string),
    84  		quit:     make(chan bool),
    85  		closed:   false,
    86  	}
    87  	go g.write(u)
    88  	return u
    89  }
    90  
    91  func (g *BasicSearch) write(u *ProjectUpdater) {
    92  	for !u.closed {
    93  		func() {
    94  			select {
    95  			case proj := <-u.c:
    96  				g.projects[proj.Root] = *proj
    97  				g.serializeProjectIndex(proj)
    98  			case p := <-u.addFw:
    99  				if g.watcher == nil {
   100  					g.logger.Warn("watcher is nil")
   101  				}
   102  				w := g.watcher.Add(p)
   103  				if w != nil {
   104  					g.logger.Error("Error adding file watcher: %v", w)
   105  				}
   106  			case p := <-u.removeFw:
   107  				if g.watcher == nil {
   108  					g.logger.Warn("watcher is nil")
   109  				}
   110  				g.watcher.Remove(p)
   111  			case <-u.quit:
   112  				u.closed = true
   113  			}
   114  		}()
   115  	}
   116  }
   117  
   118  // Should only be called by index
   119  func (g *BasicSearch) indexRec(proj *IndexedProject, path string,
   120  	gitIgnore *ignore.GitIgnore, u *ProjectUpdater) error {
   121  	// First index all files in the project
   122  	file, err := os.Open(path)
   123  	if err != nil {
   124  		return err
   125  	}
   126  	defer file.Close()
   127  	dirnames, err := file.Readdirnames(0)
   128  	g.logger.Debug("Found dirs: ", dirnames)
   129  	if err != nil {
   130  		return err
   131  	}
   132  	if len(dirnames) == 0 {
   133  		return nil
   134  	}
   135  	files := make([]string, 0, len(dirnames)/2)
   136  	for _, subFile := range dirnames {
   137  		newPath := filepath.Join(path, subFile)
   138  		fileInfo, err := os.Lstat(newPath)
   139  		if err != nil {
   140  			g.logger.Warn(err.Error())
   141  			continue
   142  		}
   143  		// Remove projectRoot prefix from newPath
   144  		relativePath := strings.TrimPrefix(newPath, proj.Root)
   145  
   146  		if !gitIgnore.MatchesPath(relativePath) && subFile != ".git" && subFile != ".pogo" {
   147  			if fileInfo.IsDir() {
   148  				u.addFw <- newPath
   149  				err = g.indexRec(proj, newPath, gitIgnore, u)
   150  				if err != nil {
   151  					g.logger.Warn(err.Error())
   152  				}
   153  			} else {
   154  				files = append(files, relativePath)
   155  			}
   156  		}
   157  	}
   158  	proj.Paths = append(proj.Paths, files...)
   159  	return nil
   160  }
   161  
   162  // Try to index all files in the project, then create a code search index.
   163  // The first is table stakes - so we error on failure. If the second fails, we log it and return.
   164  func (g *BasicSearch) index(proj *IndexedProject, path string,
   165  	gitIgnore *ignore.GitIgnore) {
   166  
   167  	u := g.updater
   168  
   169  	err := g.indexRec(proj, path, gitIgnore, u)
   170  	if err != nil {
   171  		g.logger.Warn("Error indexing project: ", err.Error())
   172  		return
   173  	}
   174  	u.c <- proj
   175  }
   176  
   177  func (g *BasicSearch) ReIndex(path string) {
   178  	fileInfo, e := os.Lstat(path)
   179  	if e != nil {
   180  		g.logger.Error("Error getting path info: ", e)
   181  		return
   182  	}
   183  	if !fileInfo.IsDir() {
   184  		path = filepath.Dir(path)
   185  	}
   186  	g.logger.Info("Reindexing ", path)
   187  	go func() {
   188  		fullPath, err2 := absolute(path)
   189  		if err2 != nil {
   190  			g.logger.Error("Error getting absolute path", path)
   191  			return
   192  		}
   193  		for projectRoot, indexed := range g.projects {
   194  			if strings.HasPrefix(fullPath, projectRoot) {
   195  				/* Below is a golang idiom for removing
   196  				elements with prefix from the slice. We
   197  				want to remove all file watchers before
   198  				reindexing, so we only add back the files
   199  				that still exist. */
   200  				relativePath := strings.TrimPrefix(fullPath, projectRoot)
   201  				paths := indexed.Paths
   202  				paths2 := paths
   203  				paths = paths[:0]
   204  				u := g.updater
   205  				for _, p := range paths2 {
   206  					if !strings.HasPrefix(p, relativePath) {
   207  						paths = append(paths, p)
   208  					} else {
   209  						u.removeFw <- p
   210  					}
   211  				}
   212  				indexed.Paths = paths
   213  
   214  				gitIgnore, err := ParseGitIgnore(projectRoot)
   215  				if err != nil {
   216  					g.logger.Error("Error parsing gitignore %v", err)
   217  				}
   218  				g.index(&indexed, fullPath, gitIgnore)
   219  				break
   220  			}
   221  		}
   222  	}()
   223  }
   224  
   225  /*
   226  Even if this function encounters an error, it will always at least return a
   227  GitIgnore that matches nothing.
   228  */
   229  func ParseGitIgnore(path string) (*ignore.GitIgnore, error) {
   230  	// Read .gitignore if exists
   231  	ignorePath := filepath.Join(path, ".gitignore")
   232  	var err error
   233  	_, err = os.Lstat(ignorePath)
   234  	var gitIgnore *ignore.GitIgnore
   235  	if err != nil {
   236  		if errors.Is(err, os.ErrNotExist) {
   237  			err = nil
   238  		}
   239  		gitIgnore = ignore.CompileIgnoreLines("")
   240  	} else {
   241  		gitIgnore, err = ignore.CompileIgnoreFile(ignorePath)
   242  		if err != nil {
   243  			gitIgnore = ignore.CompileIgnoreLines("")
   244  		}
   245  	}
   246  	return gitIgnore, err
   247  }
   248  
   249  func (g *BasicSearch) deleteIndexFile(p *IndexedProject) error {
   250  	searchDir, err := p.makeSearchDir()
   251  	if err != nil {
   252  		g.logger.Error("Error making search dir: ", err)
   253  		return err
   254  	}
   255  	indexPath := filepath.Join(searchDir, codeSearchIndexFileName)
   256  	// First check if indexPath exists
   257  	_, err = os.Lstat(indexPath)
   258  	if err != nil {
   259  		if errors.Is(err, os.ErrNotExist) {
   260  			return nil
   261  		} else {
   262  			return err
   263  		}
   264  	}
   265  	return os.Remove(indexPath)
   266  }
   267  
   268  func (g *BasicSearch) getSearchFile(p *IndexedProject, filename string) (*os.File, error) {
   269  	path := p.Root
   270  	searchDir, err := p.makeSearchDir()
   271  	if err != nil {
   272  		g.logger.Error("Error making search dir: ", err)
   273  		return nil, err
   274  	}
   275  	indexPath := filepath.Join(searchDir, filename)
   276  	indexFile, err := os.OpenFile(indexPath, os.O_CREATE|os.O_WRONLY, 0600)
   277  	if err != nil {
   278  		g.logger.Error("Error opening index file ", path)
   279  		return nil, err
   280  	}
   281  	return indexFile, nil
   282  }
   283  
// getIndexFile opens the project's code search index file via getSearchFile.
func (g *BasicSearch) getIndexFile(p *IndexedProject) (*os.File, error) {
	return g.getSearchFile(p, codeSearchIndexFileName)
}
   287  
   288  func (g *BasicSearch) Index(req *pogoPlugin.IProcessProjectReq) {
   289  	path := (*req).Path()
   290  	p, ok := g.projects[path]
   291  	if ok && p.Paths != nil && len(p.Paths) > 0 {
   292  		g.logger.Info("Already indexed ", path)
   293  		return
   294  	}
   295  	proj := IndexedProject{
   296  		Root:  path,
   297  		Paths: make([]string, 0, indexStartCapacity),
   298  	}
   299  	gitIgnore, err := ParseGitIgnore(path)
   300  	if err != nil {
   301  		// Non-fatal error
   302  		g.logger.Error("Error parsing gitignore", err)
   303  	}
   304  	g.index(&proj, path, gitIgnore)
   305  }
   306  
   307  // Here is the method where we extract the code above
   308  func (g *BasicSearch) serializeProjectIndex(proj *IndexedProject) {
   309  	searchDir, err := proj.makeSearchDir()
   310  	if err != nil {
   311  		g.logger.Error("Error making search dir: ", err)
   312  		return
   313  	}
   314  	saveFilePath := filepath.Join(searchDir, saveFileName)
   315  	outBytes, err2 := json.Marshal(proj)
   316  	if err2 != nil {
   317  		g.logger.Error("Error serializing index to json", "index", *proj)
   318  	}
   319  	err3 := os.WriteFile(saveFilePath, outBytes, 0644)
   320  	if err3 != nil {
   321  		g.logger.Error("Error saving index", "save_path", saveFilePath)
   322  	}
   323  	g.logger.Info("Indexed " + strconv.Itoa(len(proj.Paths)) + " files for " + proj.Root)
   324  
   325  	// Now serialize zoekt index
   326  
   327  	// First delete the old index
   328  	g.deleteIndexFile(proj)
   329  
   330  	indexer, err := zoekt.NewIndexBuilder(nil)
   331  	if err != nil {
   332  		g.logger.Error("Error creating search index")
   333  		return
   334  	}
   335  
   336  	// Next create the code search index
   337  	// TODO - add some useful repository metadata
   338  	for _, path := range proj.Paths {
   339  		// Prepend Root to path
   340  		fullPath := filepath.Join(proj.Root, path)
   341  		absPath, err := absolute(fullPath)
   342  		if err != nil {
   343  			g.logger.Error("Error getting absolute path - file may not exist", path)
   344  		} else {
   345  			bytes, err := ioutil.ReadFile(absPath)
   346  			if err != nil {
   347  				g.logger.Error("Error reading file ", absPath)
   348  			} else {
   349  				indexer.AddFile(absPath, bytes)
   350  			}
   351  		}
   352  	}
   353  	indexFile, err := g.getIndexFile(proj)
   354  	if err != nil {
   355  		g.logger.Error("Error getting index file ", proj.Root)
   356  		return
   357  	}
   358  	defer indexFile.Close()
   359  	err = indexer.Write(indexFile)
   360  	if err != nil {
   361  		g.logger.Error("Error writing index file ", proj.Root)
   362  		g.logger.Error("Error: ", err.Error())
   363  		return
   364  	}
   365  }
   366  
   367  func (g *BasicSearch) Load(projectRoot string) (*IndexedProject, error) {
   368  	project := &IndexedProject{
   369  		Root:  projectRoot,
   370  		Paths: make([]string, 0, indexStartCapacity),
   371  	}
   372  	searchDir, err := project.makeSearchDir()
   373  	if err != nil {
   374  		g.logger.Error("Error making search dir: ", err)
   375  		return nil, err
   376  	}
   377  	saveFilePath := filepath.Join(searchDir, saveFileName)
   378  	stat, err := os.Lstat(saveFilePath)
   379  	if err != nil {
   380  		if errors.Is(err, os.ErrNotExist) {
   381  			g.projects[projectRoot] = *project
   382  			// Return empty struct
   383  			return project, nil
   384  		}
   385  		return nil, err
   386  	}
   387  	// Check if index is stale
   388  	if time.Since(stat.ModTime()).Minutes() > indexCacheMinutes {
   389  		g.logger.Info("Index is stale for " + projectRoot)
   390  		return project, nil
   391  	}
   392  
   393  	file, err := os.Open(saveFilePath)
   394  	if err != nil {
   395  		g.logger.Error("Error opening index file.")
   396  		return nil, err
   397  	}
   398  	defer file.Close()
   399  	byteValue, _ := ioutil.ReadAll(file)
   400  	err = json.Unmarshal(byteValue, project)
   401  	if err != nil {
   402  		g.logger.Error("Error deserializing index file: %v", err)
   403  		return nil, err
   404  	}
   405  	g.logger.Info("Loaded " + strconv.Itoa(len(project.Paths)) + " files for " + projectRoot)
   406  	g.updater.c <- project
   407  	return project, nil
   408  }
   409  
   410  func (g *BasicSearch) GetFiles(projectRoot string) (*IndexedProject, error) {
   411  	project, ok := g.projects[projectRoot]
   412  	if !ok {
   413  		return nil, errors.New("Project not indexed " + projectRoot)
   414  	}
   415  	return &project, nil
   416  }
   417  
   418  func (g *BasicSearch) Search(projectRoot string, data string, duration string) (*SearchResults, error) {
   419  	project, ok := g.projects[projectRoot]
   420  	var knownProjects string
   421  	for k := range g.projects {
   422  		knownProjects += k
   423  	}
   424  	if !ok {
   425  		return nil, errors.New("Unknown project " + projectRoot + ". Known projects: " + knownProjects)
   426  	}
   427  	// Open index file
   428  	searchDir, err := project.makeSearchDir()
   429  	if err != nil {
   430  		g.logger.Error("Error making search dir: ", err)
   431  		return nil, err
   432  	}
   433  	indexPath := filepath.Join(searchDir, codeSearchIndexFileName)
   434  	indexFile, err := os.Open(indexPath)
   435  	if err != nil {
   436  		g.logger.Error("Error opening index file ", indexPath)
   437  		return nil, err
   438  	}
   439  	defer indexFile.Close()
   440  	index, err2 := zoekt.NewIndexFile(indexFile)
   441  	if err2 != nil {
   442  		g.logger.Error("Error reading index file ", indexPath)
   443  		return nil, err2
   444  	}
   445  	// Search
   446  	searcher, err := zoekt.NewSearcher(index)
   447  	if err != nil {
   448  		g.logger.Error("Error creating searcher", err)
   449  		return nil, err
   450  	}
   451  	defer searcher.Close()
   452  
   453  	var (
   454  		ctx    context.Context
   455  		cancel context.CancelFunc
   456  	)
   457  
   458  	timeout, err := time.ParseDuration(duration)
   459  	if err == nil {
   460  		// The request has a timeout, so create a context that is
   461  		// canceled automatically when the timeout expires.
   462  		ctx, cancel = context.WithTimeout(context.Background(), timeout)
   463  	} else {
   464  		ctx, cancel = context.WithCancel(context.Background())
   465  	}
   466  	defer cancel()
   467  
   468  	query, err := query.Parse(data)
   469  	if err != nil {
   470  		g.logger.Error("Error parsing query")
   471  		return nil, err
   472  	}
   473  
   474  	queryOptions := &zoekt.SearchOptions{
   475  		ChunkMatches: true,
   476  	}
   477  
   478  	result, err := searcher.Search(ctx, query, queryOptions)
   479  	if err != nil {
   480  		g.logger.Error("Error searching index")
   481  		return nil, err
   482  	}
   483  
   484  	// Create PogoFileMatch array of same size as result.Files
   485  	fileMatches := make([]PogoFileMatch, len(result.Files))
   486  
   487  	for i, file := range result.Files {
   488  		chunkMatches := make([]PogoChunkMatch, len(file.ChunkMatches))
   489  		for j, match := range file.ChunkMatches {
   490  			chunkMatches[j] = PogoChunkMatch{
   491  				Line:    match.ContentStart.LineNumber,
   492  				Content: "",
   493  			}
   494  			if len(match.Content) > 0 {
   495  				chunkMatches[j].Content = strings.TrimSpace(string(match.Content))
   496  			}
   497  		}
   498  		fileMatches[i] = PogoFileMatch{
   499  			Path:    strings.Replace(file.FileName, projectRoot, "", 1),
   500  			Matches: chunkMatches,
   501  		}
   502  	}
   503  	return &SearchResults{
   504  		Files: fileMatches,
   505  	}, nil
   506  }