github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/contentstream/contentstream.go (about) 1 /* 2 * This file is subject to the terms and conditions defined in 3 * file 'LICENSE.md', which is part of this source code package. 4 */ 5 6 package contentstream 7 8 import ( 9 "bytes" 10 "fmt" 11 12 . "github.com/unidoc/unidoc/pdf/core" 13 ) 14 15 type ContentStreamOperation struct { 16 Params []PdfObject 17 Operand string 18 } 19 20 type ContentStreamOperations []*ContentStreamOperation 21 22 // Check if the content stream operations are fully wrapped (within q ... Q) 23 func (this *ContentStreamOperations) isWrapped() bool { 24 if len(*this) < 2 { 25 return false 26 } 27 28 depth := 0 29 for _, op := range *this { 30 if op.Operand == "q" { 31 depth++ 32 } else if op.Operand == "Q" { 33 depth-- 34 } else { 35 if depth < 1 { 36 return false 37 } 38 } 39 } 40 41 // Should end at depth == 0 42 return depth == 0 43 } 44 45 // Wrap entire contents within q ... Q. If unbalanced, then adds extra Qs at the end. 46 // Only does if needed. Ensures that when adding new content, one start with all states 47 // in the default condition. 48 func (this *ContentStreamOperations) WrapIfNeeded() *ContentStreamOperations { 49 if len(*this) == 0 { 50 // No need to wrap if empty. 51 return this 52 } 53 if this.isWrapped() { 54 return this 55 } 56 57 *this = append([]*ContentStreamOperation{{Operand: "q"}}, *this...) 58 59 depth := 0 60 for _, op := range *this { 61 if op.Operand == "q" { 62 depth++ 63 } else if op.Operand == "Q" { 64 depth-- 65 } 66 } 67 68 for depth > 0 { 69 *this = append(*this, &ContentStreamOperation{Operand: "Q"}) 70 depth-- 71 } 72 73 return this 74 } 75 76 // Convert a set of content stream operations to a content stream byte presentation, i.e. the kind that can be 77 // stored as a PDF stream or string format. 78 func (this *ContentStreamOperations) Bytes() []byte { 79 var buf bytes.Buffer 80 81 for _, op := range *this { 82 if op == nil { 83 continue 84 } 85 86 if op.Operand == "BI" { 87 // Inline image requires special handling. 88 buf.WriteString(op.Operand + "\n") 89 buf.WriteString(op.Params[0].DefaultWriteString()) 90 91 } else { 92 // Default handler. 93 for _, param := range op.Params { 94 buf.WriteString(param.DefaultWriteString()) 95 buf.WriteString(" ") 96 97 } 98 99 buf.WriteString(op.Operand + "\n") 100 } 101 } 102 103 return buf.Bytes() 104 } 105 106 // ExtractText parses and extracts all text data in content streams and returns as a string. 107 // Does not take into account Encoding table, the output is simply the character codes. 108 // 109 // Deprecated: More advanced text extraction is offered in package extractor with character encoding support. 110 func (this *ContentStreamParser) ExtractText() (string, error) { 111 operations, err := this.Parse() 112 if err != nil { 113 return "", err 114 } 115 inText := false 116 xPos, yPos := float64(-1), float64(-1) 117 txt := "" 118 for _, op := range *operations { 119 if op.Operand == "BT" { 120 inText = true 121 } else if op.Operand == "ET" { 122 inText = false 123 } 124 if op.Operand == "Td" || op.Operand == "TD" || op.Operand == "T*" { 125 // Move to next line... 126 txt += "\n" 127 } 128 if op.Operand == "Tm" { 129 if len(op.Params) != 6 { 130 continue 131 } 132 xfloat, ok := op.Params[4].(*PdfObjectFloat) 133 if !ok { 134 xint, ok := op.Params[4].(*PdfObjectInteger) 135 if !ok { 136 continue 137 } 138 xfloat = MakeFloat(float64(*xint)) 139 } 140 yfloat, ok := op.Params[5].(*PdfObjectFloat) 141 if !ok { 142 yint, ok := op.Params[5].(*PdfObjectInteger) 143 if !ok { 144 continue 145 } 146 yfloat = MakeFloat(float64(*yint)) 147 } 148 if yPos == -1 { 149 yPos = float64(*yfloat) 150 } else if yPos > float64(*yfloat) { 151 txt += "\n" 152 xPos = float64(*xfloat) 153 yPos = float64(*yfloat) 154 continue 155 } 156 if xPos == -1 { 157 xPos = float64(*xfloat) 158 } else if xPos < float64(*xfloat) { 159 txt += "\t" 160 xPos = float64(*xfloat) 161 } 162 } 163 if inText && op.Operand == "TJ" { 164 if len(op.Params) < 1 { 165 continue 166 } 167 paramList, ok := op.Params[0].(*PdfObjectArray) 168 if !ok { 169 return "", fmt.Errorf("Invalid parameter type, no array (%T)", op.Params[0]) 170 } 171 for _, obj := range *paramList { 172 switch v := obj.(type) { 173 case *PdfObjectString: 174 txt += string(*v) 175 case *PdfObjectFloat: 176 if *v < -100 { 177 txt += " " 178 } 179 case *PdfObjectInteger: 180 if *v < -100 { 181 txt += " " 182 } 183 } 184 } 185 } else if inText && op.Operand == "Tj" { 186 if len(op.Params) < 1 { 187 continue 188 } 189 param, ok := op.Params[0].(*PdfObjectString) 190 if !ok { 191 return "", fmt.Errorf("Invalid parameter type, not string (%T)", op.Params[0]) 192 } 193 txt += string(*param) 194 } 195 } 196 197 return txt, nil 198 }