github.com/suntong/cascadia@v1.3.0/cascadia_main.go (about) 1 //////////////////////////////////////////////////////////////////////////// 2 // Program: cascadia 3 // Purpose: go cascadia CSS selection from command line 4 // Authors: Tong Sun (c) 2016-2023, All rights reserved 5 //////////////////////////////////////////////////////////////////////////// 6 7 //go:generate sh -v cascadia_cliGen.sh 8 9 package main 10 11 import ( 12 "fmt" 13 "io" 14 "os" 15 "strings" 16 17 "github.com/PuerkitoBio/goquery" 18 "github.com/andybalholm/cascadia" 19 "github.com/mkideal/cli" 20 "golang.org/x/net/html" 21 ) 22 23 //////////////////////////////////////////////////////////////////////////// 24 // Constant and data type/structure definitions 25 26 const ( 27 IsRaw = "RAW" 28 WrapHTMLEnd = `</body>` 29 ) 30 31 // The OptsT type defines all the configurable options from cli. 32 type OptsT struct { 33 CSS []string 34 TextOut bool 35 TextRaw bool 36 Piece PieceStyleMap 37 Deli string 38 WrapHTML bool 39 Style string 40 Base string 41 Quiet bool 42 Verbose int 43 } 44 45 //////////////////////////////////////////////////////////////////////////// 46 // Global variables definitions 47 48 var ( 49 progname = "cascadia" 50 version = "1.3.0" 51 date = "2023-06-30" 52 53 rootArgv *rootT 54 // Opts store all the configurable options 55 Opts OptsT 56 ) 57 58 var WrapHTMLBeg string 59 60 //////////////////////////////////////////////////////////////////////////// 61 // Function definitions 62 63 func main() { 64 cli.SetUsageStyle(cli.DenseNormalStyle) 65 if err := cli.Root(root).Run(os.Args[1:]); err != nil { 66 fmt.Fprintln(os.Stderr, err) 67 } 68 fmt.Println("") 69 } 70 71 //========================================================================== 72 // css selection 73 74 func CascadiaC(ctx *cli.Context) error { 75 // ctx.JSON(ctx.RootArgv()) 76 // fmt.Println() 77 // ctx.JSON(ctx.Argv()) 78 // fmt.Println() 79 80 argv := ctx.Argv().(*rootT) 81 WrapHTMLBeg = fmt.Sprintf(`<!DOCTYPE html> 82 <html> 83 <head> 84 <meta charset="utf-8"> 85 <base href="%s"> 86 %s 87 </head> 88 <body>`, argv.Base, argv.Style) 89 90 Opts.CSS, Opts.Piece, Opts.Deli, 91 Opts.WrapHTML, Opts.TextOut, Opts.TextRaw, Opts.Quiet = 92 argv.CSS, argv.Piece, argv.Deli, 93 argv.WrapHTML, argv.TextOut, argv.TextRaw, argv.Quiet 94 Cascadia(argv.Filei, argv.Fileo, Opts) 95 argv.Filei.Close() 96 argv.Fileo.Close() 97 return nil 98 } 99 100 //-------------------------------------------------------------------------- 101 102 // Cascadia filters the input buffer/stream `bi` with CSS selectors array `Opts.CSS` and write to the output buffer/stream `bw`. 103 func Cascadia(bi io.Reader, bw io.Writer, Opts OptsT) error { 104 cssa, piece, deli, wrapHTML, textOut, textRaw, beQuiet := 105 Opts.CSS, Opts.Piece, Opts.Deli, 106 Opts.WrapHTML, Opts.TextOut, Opts.TextRaw, Opts.Quiet 107 if wrapHTML { 108 fmt.Fprintln(bw, WrapHTMLBeg) 109 } 110 if len(piece.Values) == 0 { 111 // no sub CSS selectors -- none-block selection mode 112 if textOut { 113 doc, err := goquery.NewDocumentFromReader(bi) 114 abortOn("Input", err) 115 116 for _, css := range cssa { 117 // Process each item block 118 doc.Find(css).Each(func(index int, item *goquery.Selection) { 119 //fmt.Printf("] #%d: %s\n", index, item.Text()) 120 if textRaw { 121 fmt.Fprintf(bw, "%s%s", 122 item.Text(), deli) 123 } else { 124 fmt.Fprintf(bw, "%s%s", 125 strings.TrimSpace(item.Text()), deli) 126 } 127 fmt.Fprintf(bw, "\n") 128 }) 129 } 130 } else { 131 doc, err := html.Parse(bi) 132 abortOn("Input", err) 133 for _, css := range cssa { 134 c, err := cascadia.Compile(css) 135 abortOn("CSS Selector string "+css, err) 136 137 // https://godoc.org/github.com/andybalholm/cascadia 138 ns := c.MatchAll(doc) 139 if !beQuiet { 140 fmt.Fprintf(os.Stderr, "%d elements for '%s':\n", len(ns), css) 141 } 142 for _, n := range ns { 143 html.Render(bw, n) 144 fmt.Fprintf(bw, "\n") 145 } 146 } 147 } 148 } else { 149 // have sub CSS selectors within -css -- block selection mode 150 // fmt.Printf("%v\n", piece) 151 152 // https://godoc.org/github.com/PuerkitoBio/goquery 153 // for debug 154 //doc, err := goquery.NewDocumentFromReader(strings.NewReader(testhtml)) 155 doc, err := goquery.NewDocumentFromReader(bi) 156 abortOn("Input", err) 157 158 // Print csv headers 159 for _, key := range piece.Keys { 160 fmt.Fprintf(bw, "%s%s", key, deli) 161 } 162 fmt.Fprintf(bw, "\n") 163 164 // Process each item block 165 doc.Find(cssa[0]).Each(func(index int, item *goquery.Selection) { 166 //fmt.Printf("] #%d: %s\n", index, item.Text()) 167 for _, key := range piece.Keys { 168 //fmt.Printf("] %s: %s\n", key, piece.Values[key]) 169 switch piece.PieceStyles[key] { 170 case PieceStyleRAW: 171 html.Render(bw, item.Find(piece.Values[key]).Get(0)) 172 fmt.Fprintf(bw, deli) 173 case PieceStyleATTR: 174 fmt.Fprintf(bw, "%s%s", 175 item.AttrOr(piece.Values[key], ""), deli) 176 case PieceStyleTEXT: 177 fmt.Fprintf(bw, "%s%s", 178 item.Find(piece.Values[key]).Contents().Text(), deli) 179 } 180 } 181 fmt.Fprintf(bw, "\n") 182 }) 183 } 184 if wrapHTML { 185 fmt.Fprintln(bw, WrapHTMLEnd) 186 } 187 return nil 188 } 189 190 //========================================================================== 191 // support functions 192 193 // abortOn will quit on anticipated errors gracefully without stack trace 194 func abortOn(errCase string, e error) { 195 if e != nil { 196 fmt.Printf("[%s] %s error: %v\n", progname, errCase, e) 197 os.Exit(1) 198 } 199 } 200 201 //========================================================================== 202 // for debug 203 // echo a | cascadia -i -o -c "[align=\"justify\"]" -p Bold="b" 204 205 const testhtml = ` 206 <div class="container"> 207 <div class="row"> 208 <div class="col-lg-8"> 209 <p align="justify"><b>Name</b>Priyaka</p> 210 <p align="justify"><b>Surname</b>Patil</p> 211 <p align="justify"><b>Adress</b><br>India,Kolhapur</p> 212 <p align="justify"><b>Hobbies </b><br>Playing</p> 213 <p align="justify"><b>Eduction</b><br>12th</p> 214 <p align="justify"><b>School</b><br>New Highschool</p> 215 </div> 216 </div> 217 </div> 218 `