github.com/hscells/guru@v0.0.0-20200207042420-2dabeb950d69/xtrecresults.go (about) 1 package guru 2 3 import ( 4 "bytes" 5 "encoding/binary" 6 "fmt" 7 "github.com/hscells/trecresults" 8 "io" 9 ) 10 11 var MaxTopicWidth = 10 12 var MaxDocIDWidth = 8 13 14 // ReadCompressedTrecResultFile reads a compressed trec results (run) file. 15 // File format is as follows: 16 // 17 // HEADER 18 // ------ 19 // |byte |byte |'\n' | 20 // |topic width |docID width|new line character| 21 // 22 // BODY 23 // ---- 24 // |`topic width` bytes|8 bytes |k*`docID width` bytes|'0000000\n'| 25 // |topic |number of documents (uint64), k|document IDs |padding | 26 // 27 // The HEADER section is written once and defines how the file should be read. 28 // The BODY section should be written to a single line such that multiple BODY sections 29 // may be concatenated together manually and the file can still be decompressed. 30 func ReadCompressedTrecResultFile(r io.Reader) (*trecresults.ResultFile, error) { 31 resultFile := trecresults.NewResultFile() 32 33 // Read first three bytes: [topic width, doc ID width, '\n']. 34 p := make([]byte, 3) 35 _, err := r.Read(p) 36 if err != nil { 37 return nil, err 38 } 39 topicWidth := int(p[0]) 40 docIDWidth := int(p[1]) 41 42 var ( 43 tok int 44 topic string 45 k int 46 ) 47 48 // Read next set of bytes: [topic (string of size 'topic width'), number of results (uint64)]. 49 p = make([]byte, topicWidth+8) 50 for { 51 _, err = r.Read(p) 52 if err != nil && err != io.EOF { 53 return nil, err 54 } 55 if err == io.EOF { 56 return resultFile, nil 57 } 58 59 if tok == 0 { 60 topic = fixedWidthStringify(p[:topicWidth]) // First half of bytes is the topic. 61 k = int(int64(binary.BigEndian.Uint64(p[topicWidth:]))) // Second half is the number of document IDs. 62 resultFile.Results[topic] = make(trecresults.ResultList, k) // Allocate memory for the document IDs. 63 p = make([]byte, (k*docIDWidth)+docIDWidth) // Allocate the bytes for the next read, which will contain the document IDs. 64 tok = 1 65 } else { 66 for i, j := 0, 0; i < len(p)-docIDWidth; i += docIDWidth { // Iterate over each [doc ID (string of size 'doc ID width')]. 67 v := p[i : i+docIDWidth] 68 if v[0] == 0 { 69 fmt.Println(v) 70 break 71 } 72 resultFile.Results[topic][j] = &trecresults.Result{ 73 DocId: fixedWidthStringify(v), 74 } 75 j++ 76 } 77 p = make([]byte, topicWidth+8) // Ready to read the next topic, so allocate the bytes for it. 78 tok = 0 79 } 80 } 81 } 82 83 // WriteCompressedTrecResultFile writes a result file in memory to disk. 84 // See `ReadCompressedTrecResultFile` for file format. 85 func WriteCompressedTrecResultFile(w io.Writer, res trecresults.ResultFile) (int, error) { 86 // The first three bytes are the topic width, the doc ID width, and a newline character. 87 n, err := w.Write([]byte{byte(MaxTopicWidth), byte(MaxDocIDWidth), '\n'}) 88 if err != nil { 89 return n, err 90 } 91 for topic, resultList := range res.Results { 92 93 // Write the topic at the start of the line. 94 l, err := w.Write(fixWidthString(topic, MaxTopicWidth)) 95 if err != nil { 96 return n, err 97 } 98 n += l 99 100 p := make([]byte, 8) 101 binary.BigEndian.PutUint64(p, uint64(len(resultList))) 102 l, err = w.Write(p) 103 if err != nil { 104 return n, err 105 } 106 l += n 107 108 // Write each docID in fixed bytes. 109 var buff bytes.Buffer 110 for _, result := range resultList { 111 _, err := buff.Write(fixWidthString(result.DocId, MaxDocIDWidth)) 112 if err != nil { 113 return n, err 114 } 115 } 116 // That's the end of a result list, the next one will appear on the next line. 117 endDoc := make([]byte, MaxDocIDWidth) 118 endDoc[len(endDoc)-1] = '\n' 119 _, err = buff.Write(endDoc) 120 if err != nil { 121 return n, err 122 } 123 124 l, err = w.Write(buff.Bytes()) 125 if err != nil { 126 return n, err 127 } 128 n += l 129 130 } 131 return n, nil 132 } 133 134 // fixWidthString converts a string to a fixed width byte slice. 135 // The byte slice is padded with 0s. 136 func fixWidthString(s string, width int) []byte { 137 b := make([]byte, width) 138 for i := 0; i < width; i++ { 139 if i < len(s) { 140 b[i] = s[i] 141 } else { 142 b[i] = 0 143 } 144 } 145 return b 146 } 147 148 // fixedWidthStringify converts a byte slice to a string, removing padded 0s. 149 func fixedWidthStringify(b []byte) string { 150 var s string 151 for _, v := range b { 152 if v == 0 { 153 return s 154 } 155 s += string(v) 156 } 157 return s 158 }