github.com/hscells/guru@v0.0.0-20200207042420-2dabeb950d69/xtrecresults.go (about)

     1  package guru
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/binary"
     6  	"fmt"
     7  	"github.com/hscells/trecresults"
     8  	"io"
     9  )
    10  
    11  var MaxTopicWidth = 10
    12  var MaxDocIDWidth = 8
    13  
    14  // ReadCompressedTrecResultFile reads a compressed trec results (run) file.
    15  // File format is as follows:
    16  //
    17  // HEADER
    18  // ------
    19  // |byte        |byte       |'\n'              |
    20  // |topic width |docID width|new line character|
    21  //
    22  // BODY
    23  // ----
    24  // |`topic width` bytes|8 bytes                        |k*`docID width` bytes|'0000000\n'|
    25  // |topic              |number of documents (uint64), k|document IDs         |padding    |
    26  //
    27  // The HEADER section is written once and defines how the file should be read.
    28  // The BODY section should be written to a single line such that multiple BODY sections
    29  // may be concatenated together manually and the file can still be decompressed.
    30  func ReadCompressedTrecResultFile(r io.Reader) (*trecresults.ResultFile, error) {
    31  	resultFile := trecresults.NewResultFile()
    32  
    33  	// Read first three bytes: [topic width, doc ID width, '\n'].
    34  	p := make([]byte, 3)
    35  	_, err := r.Read(p)
    36  	if err != nil {
    37  		return nil, err
    38  	}
    39  	topicWidth := int(p[0])
    40  	docIDWidth := int(p[1])
    41  
    42  	var (
    43  		tok   int
    44  		topic string
    45  		k     int
    46  	)
    47  
    48  	// Read next set of bytes: [topic (string of size 'topic width'), number of results (uint64)].
    49  	p = make([]byte, topicWidth+8)
    50  	for {
    51  		_, err = r.Read(p)
    52  		if err != nil && err != io.EOF {
    53  			return nil, err
    54  		}
    55  		if err == io.EOF {
    56  			return resultFile, nil
    57  		}
    58  
    59  		if tok == 0 {
    60  			topic = fixedWidthStringify(p[:topicWidth])                 // First half of bytes is the topic.
    61  			k = int(int64(binary.BigEndian.Uint64(p[topicWidth:])))     // Second half is the number of document IDs.
    62  			resultFile.Results[topic] = make(trecresults.ResultList, k) // Allocate memory for the document IDs.
    63  			p = make([]byte, (k*docIDWidth)+docIDWidth)                 // Allocate the bytes for the next read, which will contain the document IDs.
    64  			tok = 1
    65  		} else {
    66  			for i, j := 0, 0; i < len(p)-docIDWidth; i += docIDWidth { // Iterate over each [doc ID (string of size 'doc ID width')].
    67  				v := p[i : i+docIDWidth]
    68  				if v[0] == 0 {
    69  					fmt.Println(v)
    70  					break
    71  				}
    72  				resultFile.Results[topic][j] = &trecresults.Result{
    73  					DocId: fixedWidthStringify(v),
    74  				}
    75  				j++
    76  			}
    77  			p = make([]byte, topicWidth+8) // Ready to read the next topic, so allocate the bytes for it.
    78  			tok = 0
    79  		}
    80  	}
    81  }
    82  
    83  // WriteCompressedTrecResultFile writes a result file in memory to disk.
    84  // See `ReadCompressedTrecResultFile` for file format.
    85  func WriteCompressedTrecResultFile(w io.Writer, res trecresults.ResultFile) (int, error) {
    86  	// The first three bytes are the topic width, the doc ID width, and a newline character.
    87  	n, err := w.Write([]byte{byte(MaxTopicWidth), byte(MaxDocIDWidth), '\n'})
    88  	if err != nil {
    89  		return n, err
    90  	}
    91  	for topic, resultList := range res.Results {
    92  
    93  		// Write the topic at the start of the line.
    94  		l, err := w.Write(fixWidthString(topic, MaxTopicWidth))
    95  		if err != nil {
    96  			return n, err
    97  		}
    98  		n += l
    99  
   100  		p := make([]byte, 8)
   101  		binary.BigEndian.PutUint64(p, uint64(len(resultList)))
   102  		l, err = w.Write(p)
   103  		if err != nil {
   104  			return n, err
   105  		}
   106  		l += n
   107  
   108  		// Write each docID in fixed bytes.
   109  		var buff bytes.Buffer
   110  		for _, result := range resultList {
   111  			_, err := buff.Write(fixWidthString(result.DocId, MaxDocIDWidth))
   112  			if err != nil {
   113  				return n, err
   114  			}
   115  		}
   116  		// That's the end of a result list, the next one will appear on the next line.
   117  		endDoc := make([]byte, MaxDocIDWidth)
   118  		endDoc[len(endDoc)-1] = '\n'
   119  		_, err = buff.Write(endDoc)
   120  		if err != nil {
   121  			return n, err
   122  		}
   123  
   124  		l, err = w.Write(buff.Bytes())
   125  		if err != nil {
   126  			return n, err
   127  		}
   128  		n += l
   129  
   130  	}
   131  	return n, nil
   132  }
   133  
   134  // fixWidthString converts a string to a fixed width byte slice.
   135  // The byte slice is padded with 0s.
   136  func fixWidthString(s string, width int) []byte {
   137  	b := make([]byte, width)
   138  	for i := 0; i < width; i++ {
   139  		if i < len(s) {
   140  			b[i] = s[i]
   141  		} else {
   142  			b[i] = 0
   143  		}
   144  	}
   145  	return b
   146  }
   147  
   148  // fixedWidthStringify converts a byte slice to a string, removing padded 0s.
   149  func fixedWidthStringify(b []byte) string {
   150  	var s string
   151  	for _, v := range b {
   152  		if v == 0 {
   153  			return s
   154  		}
   155  		s += string(v)
   156  	}
   157  	return s
   158  }