github.com/suntong/cascadia@v1.3.0/cascadia_main.go (about)

     1  ////////////////////////////////////////////////////////////////////////////
     2  // Program: cascadia
     3  // Purpose: go cascadia CSS selection from command line
     4  // Authors: Tong Sun (c) 2016-2023, All rights reserved
     5  ////////////////////////////////////////////////////////////////////////////
     6  
     7  //go:generate sh -v cascadia_cliGen.sh
     8  
     9  package main
    10  
    11  import (
    12  	"fmt"
    13  	"io"
    14  	"os"
    15  	"strings"
    16  
    17  	"github.com/PuerkitoBio/goquery"
    18  	"github.com/andybalholm/cascadia"
    19  	"github.com/mkideal/cli"
    20  	"golang.org/x/net/html"
    21  )
    22  
    23  ////////////////////////////////////////////////////////////////////////////
    24  // Constant and data type/structure definitions
    25  
    26  const (
    27  	IsRaw       = "RAW"
    28  	WrapHTMLEnd = `</body>`
    29  )
    30  
    31  // The OptsT type defines all the configurable options from cli.
    32  type OptsT struct {
    33  	CSS      []string
    34  	TextOut  bool
    35  	TextRaw  bool
    36  	Piece    PieceStyleMap
    37  	Deli     string
    38  	WrapHTML bool
    39  	Style    string
    40  	Base     string
    41  	Quiet    bool
    42  	Verbose  int
    43  }
    44  
    45  ////////////////////////////////////////////////////////////////////////////
    46  // Global variables definitions
    47  
    48  var (
    49  	progname = "cascadia"
    50  	version  = "1.3.0"
    51  	date     = "2023-06-30"
    52  
    53  	rootArgv *rootT
    54  	// Opts store all the configurable options
    55  	Opts OptsT
    56  )
    57  
    58  var WrapHTMLBeg string
    59  
    60  ////////////////////////////////////////////////////////////////////////////
    61  // Function definitions
    62  
    63  func main() {
    64  	cli.SetUsageStyle(cli.DenseNormalStyle)
    65  	if err := cli.Root(root).Run(os.Args[1:]); err != nil {
    66  		fmt.Fprintln(os.Stderr, err)
    67  	}
    68  	fmt.Println("")
    69  }
    70  
    71  //==========================================================================
    72  // css selection
    73  
    74  func CascadiaC(ctx *cli.Context) error {
    75  	// ctx.JSON(ctx.RootArgv())
    76  	// fmt.Println()
    77  	// ctx.JSON(ctx.Argv())
    78  	// fmt.Println()
    79  
    80  	argv := ctx.Argv().(*rootT)
    81  	WrapHTMLBeg = fmt.Sprintf(`<!DOCTYPE html>
    82  <html>
    83  <head>
    84  <meta charset="utf-8">
    85  <base href="%s">
    86  %s
    87  </head>
    88  <body>`, argv.Base, argv.Style)
    89  
    90  	Opts.CSS, Opts.Piece, Opts.Deli,
    91  		Opts.WrapHTML, Opts.TextOut, Opts.TextRaw, Opts.Quiet =
    92  		argv.CSS, argv.Piece, argv.Deli,
    93  		argv.WrapHTML, argv.TextOut, argv.TextRaw, argv.Quiet
    94  	Cascadia(argv.Filei, argv.Fileo, Opts)
    95  	argv.Filei.Close()
    96  	argv.Fileo.Close()
    97  	return nil
    98  }
    99  
   100  //--------------------------------------------------------------------------
   101  
   102  // Cascadia filters the input buffer/stream `bi` with CSS selectors array `Opts.CSS` and write to the output buffer/stream `bw`.
   103  func Cascadia(bi io.Reader, bw io.Writer, Opts OptsT) error {
   104  	cssa, piece, deli, wrapHTML, textOut, textRaw, beQuiet :=
   105  		Opts.CSS, Opts.Piece, Opts.Deli,
   106  		Opts.WrapHTML, Opts.TextOut, Opts.TextRaw, Opts.Quiet
   107  	if wrapHTML {
   108  		fmt.Fprintln(bw, WrapHTMLBeg)
   109  	}
   110  	if len(piece.Values) == 0 {
   111  		// no sub CSS selectors -- none-block selection mode
   112  		if textOut {
   113  			doc, err := goquery.NewDocumentFromReader(bi)
   114  			abortOn("Input", err)
   115  
   116  			for _, css := range cssa {
   117  				// Process each item block
   118  				doc.Find(css).Each(func(index int, item *goquery.Selection) {
   119  					//fmt.Printf("] #%d: %s\n", index, item.Text())
   120  					if textRaw {
   121  						fmt.Fprintf(bw, "%s%s",
   122  							item.Text(), deli)
   123  					} else {
   124  						fmt.Fprintf(bw, "%s%s",
   125  							strings.TrimSpace(item.Text()), deli)
   126  					}
   127  					fmt.Fprintf(bw, "\n")
   128  				})
   129  			}
   130  		} else {
   131  			doc, err := html.Parse(bi)
   132  			abortOn("Input", err)
   133  			for _, css := range cssa {
   134  				c, err := cascadia.Compile(css)
   135  				abortOn("CSS Selector string "+css, err)
   136  
   137  				// https://godoc.org/github.com/andybalholm/cascadia
   138  				ns := c.MatchAll(doc)
   139  				if !beQuiet {
   140  					fmt.Fprintf(os.Stderr, "%d elements for '%s':\n", len(ns), css)
   141  				}
   142  				for _, n := range ns {
   143  					html.Render(bw, n)
   144  					fmt.Fprintf(bw, "\n")
   145  				}
   146  			}
   147  		}
   148  	} else {
   149  		// have sub CSS selectors within -css -- block selection mode
   150  		// fmt.Printf("%v\n", piece)
   151  
   152  		// https://godoc.org/github.com/PuerkitoBio/goquery
   153  		// for debug
   154  		//doc, err := goquery.NewDocumentFromReader(strings.NewReader(testhtml))
   155  		doc, err := goquery.NewDocumentFromReader(bi)
   156  		abortOn("Input", err)
   157  
   158  		// Print csv headers
   159  		for _, key := range piece.Keys {
   160  			fmt.Fprintf(bw, "%s%s", key, deli)
   161  		}
   162  		fmt.Fprintf(bw, "\n")
   163  
   164  		// Process each item block
   165  		doc.Find(cssa[0]).Each(func(index int, item *goquery.Selection) {
   166  			//fmt.Printf("] #%d: %s\n", index, item.Text())
   167  			for _, key := range piece.Keys {
   168  				//fmt.Printf("] %s: %s\n", key, piece.Values[key])
   169  				switch piece.PieceStyles[key] {
   170  				case PieceStyleRAW:
   171  					html.Render(bw, item.Find(piece.Values[key]).Get(0))
   172  					fmt.Fprintf(bw, deli)
   173  				case PieceStyleATTR:
   174  					fmt.Fprintf(bw, "%s%s",
   175  						item.AttrOr(piece.Values[key], ""), deli)
   176  				case PieceStyleTEXT:
   177  					fmt.Fprintf(bw, "%s%s",
   178  						item.Find(piece.Values[key]).Contents().Text(), deli)
   179  				}
   180  			}
   181  			fmt.Fprintf(bw, "\n")
   182  		})
   183  	}
   184  	if wrapHTML {
   185  		fmt.Fprintln(bw, WrapHTMLEnd)
   186  	}
   187  	return nil
   188  }
   189  
   190  //==========================================================================
   191  // support functions
   192  
   193  // abortOn will quit on anticipated errors gracefully without stack trace
   194  func abortOn(errCase string, e error) {
   195  	if e != nil {
   196  		fmt.Printf("[%s] %s error: %v\n", progname, errCase, e)
   197  		os.Exit(1)
   198  	}
   199  }
   200  
   201  //==========================================================================
   202  // for debug
   203  // echo a | cascadia -i -o -c "[align=\"justify\"]" -p Bold="b"
   204  
// testhtml is a sample HTML fragment kept for debugging: a container holding
// several <p align="justify"> paragraphs, each starting with a <b> label.
// In the visible code it is referenced only from a commented-out line.
const testhtml = `
<div class="container">
    <div class="row">
      <div class="col-lg-8">
        <p align="justify"><b>Name</b>Priyaka</p>
        <p align="justify"><b>Surname</b>Patil</p>
        <p align="justify"><b>Adress</b><br>India,Kolhapur</p>
        <p align="justify"><b>Hobbies&nbsp;</b><br>Playing</p>
        <p align="justify"><b>Eduction</b><br>12th</p>
        <p align="justify"><b>School</b><br>New Highschool</p>
       </div>
    </div>
</div>
`