github.com/attic-labs/noms@v0.0.0-20210827224422-e5fa29d95e8b/samples/go/decent/lib/importer.go (about)

     1  // See: https://github.com/attic-labs/noms/issues/3808
     2  // +build ignore
     3  
     4  // Copyright 2017 Attic Labs, Inc. All rights reserved.
     5  // Licensed under the Apache License, version 2.0:
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  
     8  package lib
     9  
    10  import (
    11  	"errors"
    12  	"fmt"
    13  	"os"
    14  	"path/filepath"
    15  	"regexp"
    16  	"sort"
    17  	"strings"
    18  
    19  	"github.com/attic-labs/noms/go/d"
    20  	"github.com/attic-labs/noms/go/marshal"
    21  	"github.com/attic-labs/noms/go/merge"
    22  	"github.com/attic-labs/noms/go/spec"
    23  	"github.com/attic-labs/noms/go/types"
    24  	"github.com/attic-labs/noms/go/util/datetime"
    25  	"golang.org/x/net/html"
    26  )
    27  
    28  var (
    29  	character = ""
    30  	msgs      = []Message{}
    31  )
    32  
    33  func RunImport(dir, dsSpec string) error {
    34  	filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
    35  		if path == dir {
    36  			return nil
    37  		}
    38  		if !strings.HasSuffix(info.Name(), ".html") {
    39  			return nil
    40  		}
    41  		fmt.Println("importing:", path)
    42  		f, err := os.Open(path)
    43  		d.Chk.NoError(err)
    44  		n, err := html.Parse(f)
    45  		d.Chk.NoError(err)
    46  		extractDialog(n)
    47  		return nil
    48  	})
    49  
    50  	if len(msgs) == 0 {
    51  		return errors.New("Failed to import any data")
    52  	}
    53  	fmt.Println("Imported", len(msgs), "messages")
    54  
    55  	sp, err := spec.ForDataset(dsSpec)
    56  	d.CheckErrorNoUsage(err)
    57  	ds := sp.GetDataset()
    58  	ds, err = InitDatabase(ds)
    59  	d.PanicIfError(err)
    60  	db := ds.Database()
    61  
    62  	fmt.Println("Creating msg map")
    63  	kvPairs := []types.Value{}
    64  	for _, msg := range msgs {
    65  		kvPairs = append(kvPairs, types.String(msg.ID()), marshal.MustMarshal(db, msg))
    66  	}
    67  	m := types.NewMap(db, kvPairs...)
    68  
    69  	fmt.Println("Creating index")
    70  	ti := NewTermIndex(db, types.NewMap(db)).Edit()
    71  	for _, msg := range msgs {
    72  		terms := GetTerms(msg)
    73  		ti.InsertAll(terms, types.String(msg.ID()))
    74  	}
    75  	termDocs := ti.Value().TermDocs
    76  
    77  	fmt.Println("Creating users")
    78  	users := topUsers(msgs)
    79  
    80  	fmt.Println("Docs:", termDocs.Len(), "Users:", len(users))
    81  	root := Root{Messages: m, Index: termDocs, Users: users}
    82  	nroot := marshal.MustMarshal(db, root)
    83  	if ds.HasHead() {
    84  		left := ds.HeadValue()
    85  		parent := marshal.MustMarshal(db, Root{
    86  			Index:    types.NewMap(db),
    87  			Messages: types.NewMap(db),
    88  		})
    89  		fmt.Println("Merging data")
    90  		nroot, err = merge.ThreeWay(left, nroot, parent, db, nil, nil)
    91  		fmt.Println("Merging complete")
    92  		d.Chk.NoError(err)
    93  	}
    94  	fmt.Println("Committing data")
    95  	_, err = db.CommitValue(ds, nroot)
    96  	return err
    97  }
    98  
    99  func extractDialog(n *html.Node) {
   100  	if c := characterName(n); c != "" {
   101  		//fmt.Println("Character:", character)
   102  		character = c
   103  		return
   104  	}
   105  	if character != "" && n.Type == html.TextNode {
   106  		//fmt.Println("Dialog:", strings.TrimSpace(n.Data))
   107  		msg := Message{
   108  			Ordinal:    uint64(len(msgs)),
   109  			Author:     character,
   110  			Body:       strings.TrimSpace(n.Data),
   111  			ClientTime: datetime.Now(),
   112  		}
   113  		msgs = append(msgs, msg)
   114  		character = ""
   115  	}
   116  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   117  		extractDialog(c)
   118  	}
   119  }
   120  
   121  func characterName(n *html.Node) string {
   122  	if n.Type != html.ElementNode ||
   123  		n.Data != "b" ||
   124  		n.FirstChild == nil {
   125  		return ""
   126  	}
   127  
   128  	if hasSpaces, _ := regexp.MatchString(`^\s+[^\s]`, n.FirstChild.Data); !hasSpaces {
   129  		return ""
   130  	}
   131  	return strings.TrimSpace(n.FirstChild.Data)
   132  }
   133  
   134  type cpair struct {
   135  	character string
   136  	cnt       int
   137  }
   138  
   139  func topUsers(msgs []Message) []string {
   140  	userpat := regexp.MustCompile(`^[a-zA-Z][a-zA-Z\s]*\d*$`)
   141  	usermap := map[string]int{}
   142  	for _, msg := range msgs {
   143  		name := strings.TrimSpace(msg.Author)
   144  		if userpat.MatchString(name) {
   145  			usermap[name] += 1
   146  		}
   147  	}
   148  	pairs := []cpair{}
   149  	for name, cnt := range usermap {
   150  		if len(name) > 1 && !strings.HasPrefix(name, "ANOTHER") {
   151  			pairs = append(pairs, cpair{character: strings.ToLower(name), cnt: cnt})
   152  		}
   153  	}
   154  	// sort descending by cnt
   155  	sort.Slice(pairs, func(i, j int) bool {
   156  		return pairs[j].cnt < pairs[i].cnt
   157  	})
   158  	users := []string{}
   159  	for i, p := range pairs {
   160  		if i >= 30 {
   161  			break
   162  		}
   163  		users = append(users, p.character)
   164  	}
   165  	sort.Strings(users)
   166  	return users
   167  }