github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/extractor/text.go (about) 1 /* 2 * This file is subject to the terms and conditions defined in 3 * file 'LICENSE.md', which is part of this source code package. 4 */ 5 6 package extractor 7 8 import ( 9 "bytes" 10 "errors" 11 "fmt" 12 13 "github.com/unidoc/unidoc/common" 14 "github.com/unidoc/unidoc/pdf/contentstream" 15 "github.com/unidoc/unidoc/pdf/core" 16 "github.com/unidoc/unidoc/pdf/internal/cmap" 17 "github.com/unidoc/unidoc/pdf/model" 18 ) 19 20 // ExtractText processes and extracts all text data in content streams and returns as a string. Takes into 21 // account character encoding via CMaps in the PDF file. 22 // The text is processed linearly e.g. in the order in which it appears. A best effort is done to add 23 // spaces and newlines. 24 func (e *Extractor) ExtractText() (string, error) { 25 var buf bytes.Buffer 26 27 cstreamParser := contentstream.NewContentStreamParser(e.contents) 28 operations, err := cstreamParser.Parse() 29 if err != nil { 30 return buf.String(), err 31 } 32 33 processor := contentstream.NewContentStreamProcessor(*operations) 34 35 var codemap *cmap.CMap 36 inText := false 37 xPos, yPos := float64(-1), float64(-1) 38 39 processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "", 40 func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState, resources *model.PdfPageResources) error { 41 operand := op.Operand 42 switch operand { 43 case "BT": 44 inText = true 45 case "ET": 46 inText = false 47 case "Tf": 48 if !inText { 49 common.Log.Debug("Tf operand outside text") 50 return nil 51 } 52 53 if len(op.Params) != 2 { 54 common.Log.Debug("Error Tf should only get 2 input params, got %d", len(op.Params)) 55 return errors.New("Incorrect parameter count") 56 } 57 58 codemap = nil 59 60 fontName, ok := op.Params[0].(*core.PdfObjectName) 61 if !ok { 62 common.Log.Debug("Error Tf font input not a name") 63 return errors.New("Tf range error") 64 } 65 66 if resources == nil { 67 return nil 68 } 69 70 fontObj, found := resources.GetFontByName(*fontName) 71 if !found { 72 common.Log.Debug("Font not found...") 73 return errors.New("Font not in resources") 74 } 75 76 fontObj = core.TraceToDirectObject(fontObj) 77 if fontDict, isDict := fontObj.(*core.PdfObjectDictionary); isDict { 78 toUnicode := fontDict.Get("ToUnicode") 79 if toUnicode != nil { 80 toUnicode = core.TraceToDirectObject(toUnicode) 81 toUnicodeStream, ok := toUnicode.(*core.PdfObjectStream) 82 if !ok { 83 return errors.New("Invalid ToUnicode entry - not a stream") 84 } 85 decoded, err := core.DecodeStream(toUnicodeStream) 86 if err != nil { 87 return err 88 } 89 90 codemap, err = cmap.LoadCmapFromData(decoded) 91 if err != nil { 92 return err 93 } 94 } 95 } 96 case "T*": 97 if !inText { 98 common.Log.Debug("T* operand outside text") 99 return nil 100 } 101 buf.WriteString("\n") 102 case "Td", "TD": 103 if !inText { 104 common.Log.Debug("Td/TD operand outside text") 105 return nil 106 } 107 108 // Params: [tx ty], corresponeds to Tm=Tlm=[1 0 0;0 1 0;tx ty 1]*Tm 109 if len(op.Params) != 2 { 110 common.Log.Debug("Td/TD invalid arguments") 111 return nil 112 } 113 tx, err := getNumberAsFloat(op.Params[0]) 114 if err != nil { 115 common.Log.Debug("Td Float parse error") 116 return nil 117 } 118 ty, err := getNumberAsFloat(op.Params[1]) 119 if err != nil { 120 common.Log.Debug("Td Float parse error") 121 return nil 122 } 123 124 if tx > 0 { 125 buf.WriteString(" ") 126 } 127 if ty < 0 { 128 // TODO: More flexible space characters? 129 buf.WriteString("\n") 130 } 131 case "Tm": 132 if !inText { 133 common.Log.Debug("Tm operand outside text") 134 return nil 135 } 136 137 // Params: a,b,c,d,e,f as in Tm = [a b 0; c d 0; e f 1]. 138 // The last two (e,f) represent translation. 139 if len(op.Params) != 6 { 140 return errors.New("Tm: Invalid number of inputs") 141 } 142 xfloat, ok := op.Params[4].(*core.PdfObjectFloat) 143 if !ok { 144 xint, ok := op.Params[4].(*core.PdfObjectInteger) 145 if !ok { 146 return nil 147 } 148 xfloat = core.MakeFloat(float64(*xint)) 149 } 150 yfloat, ok := op.Params[5].(*core.PdfObjectFloat) 151 if !ok { 152 yint, ok := op.Params[5].(*core.PdfObjectInteger) 153 if !ok { 154 return nil 155 } 156 yfloat = core.MakeFloat(float64(*yint)) 157 } 158 if yPos == -1 { 159 yPos = float64(*yfloat) 160 } else if yPos > float64(*yfloat) { 161 buf.WriteString("\n") 162 xPos = float64(*xfloat) 163 yPos = float64(*yfloat) 164 return nil 165 } 166 if xPos == -1 { 167 xPos = float64(*xfloat) 168 } else if xPos < float64(*xfloat) { 169 buf.WriteString("\t") 170 xPos = float64(*xfloat) 171 } 172 case "TJ": 173 if !inText { 174 common.Log.Debug("TJ operand outside text") 175 return nil 176 } 177 if len(op.Params) < 1 { 178 return nil 179 } 180 paramList, ok := op.Params[0].(*core.PdfObjectArray) 181 if !ok { 182 return fmt.Errorf("Invalid parameter type, no array (%T)", op.Params[0]) 183 } 184 for _, obj := range *paramList { 185 switch v := obj.(type) { 186 case *core.PdfObjectString: 187 if codemap != nil { 188 buf.WriteString(codemap.CharcodeBytesToUnicode([]byte(*v))) 189 } else { 190 buf.WriteString(string(*v)) 191 } 192 case *core.PdfObjectFloat: 193 if *v < -100 { 194 buf.WriteString(" ") 195 } 196 case *core.PdfObjectInteger: 197 if *v < -100 { 198 buf.WriteString(" ") 199 } 200 } 201 } 202 case "Tj": 203 if !inText { 204 common.Log.Debug("Tj operand outside text") 205 return nil 206 } 207 if len(op.Params) < 1 { 208 return nil 209 } 210 param, ok := op.Params[0].(*core.PdfObjectString) 211 if !ok { 212 return fmt.Errorf("Invalid parameter type, not string (%T)", op.Params[0]) 213 } 214 if codemap != nil { 215 buf.WriteString(codemap.CharcodeBytesToUnicode([]byte(*param))) 216 } else { 217 buf.WriteString(string(*param)) 218 } 219 } 220 221 return nil 222 }) 223 224 err = processor.Process(e.resources) 225 if err != nil { 226 common.Log.Error("Error processing: %v", err) 227 return buf.String(), err 228 } 229 230 procBuf(&buf) 231 232 return buf.String(), nil 233 }