github.com/attic-labs/noms@v0.0.0-20210827224422-e5fa29d95e8b/samples/go/decent/lib/importer.go (about) 1 // See: https://github.com/attic-labs/noms/issues/3808 2 // +build ignore 3 4 // Copyright 2017 Attic Labs, Inc. All rights reserved. 5 // Licensed under the Apache License, version 2.0: 6 // http://www.apache.org/licenses/LICENSE-2.0 7 8 package lib 9 10 import ( 11 "errors" 12 "fmt" 13 "os" 14 "path/filepath" 15 "regexp" 16 "sort" 17 "strings" 18 19 "github.com/attic-labs/noms/go/d" 20 "github.com/attic-labs/noms/go/marshal" 21 "github.com/attic-labs/noms/go/merge" 22 "github.com/attic-labs/noms/go/spec" 23 "github.com/attic-labs/noms/go/types" 24 "github.com/attic-labs/noms/go/util/datetime" 25 "golang.org/x/net/html" 26 ) 27 28 var ( 29 character = "" 30 msgs = []Message{} 31 ) 32 33 func RunImport(dir, dsSpec string) error { 34 filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { 35 if path == dir { 36 return nil 37 } 38 if !strings.HasSuffix(info.Name(), ".html") { 39 return nil 40 } 41 fmt.Println("importing:", path) 42 f, err := os.Open(path) 43 d.Chk.NoError(err) 44 n, err := html.Parse(f) 45 d.Chk.NoError(err) 46 extractDialog(n) 47 return nil 48 }) 49 50 if len(msgs) == 0 { 51 return errors.New("Failed to import any data") 52 } 53 fmt.Println("Imported", len(msgs), "messages") 54 55 sp, err := spec.ForDataset(dsSpec) 56 d.CheckErrorNoUsage(err) 57 ds := sp.GetDataset() 58 ds, err = InitDatabase(ds) 59 d.PanicIfError(err) 60 db := ds.Database() 61 62 fmt.Println("Creating msg map") 63 kvPairs := []types.Value{} 64 for _, msg := range msgs { 65 kvPairs = append(kvPairs, types.String(msg.ID()), marshal.MustMarshal(db, msg)) 66 } 67 m := types.NewMap(db, kvPairs...) 68 69 fmt.Println("Creating index") 70 ti := NewTermIndex(db, types.NewMap(db)).Edit() 71 for _, msg := range msgs { 72 terms := GetTerms(msg) 73 ti.InsertAll(terms, types.String(msg.ID())) 74 } 75 termDocs := ti.Value().TermDocs 76 77 fmt.Println("Creating users") 78 users := topUsers(msgs) 79 80 fmt.Println("Docs:", termDocs.Len(), "Users:", len(users)) 81 root := Root{Messages: m, Index: termDocs, Users: users} 82 nroot := marshal.MustMarshal(db, root) 83 if ds.HasHead() { 84 left := ds.HeadValue() 85 parent := marshal.MustMarshal(db, Root{ 86 Index: types.NewMap(db), 87 Messages: types.NewMap(db), 88 }) 89 fmt.Println("Merging data") 90 nroot, err = merge.ThreeWay(left, nroot, parent, db, nil, nil) 91 fmt.Println("Merging complete") 92 d.Chk.NoError(err) 93 } 94 fmt.Println("Committing data") 95 _, err = db.CommitValue(ds, nroot) 96 return err 97 } 98 99 func extractDialog(n *html.Node) { 100 if c := characterName(n); c != "" { 101 //fmt.Println("Character:", character) 102 character = c 103 return 104 } 105 if character != "" && n.Type == html.TextNode { 106 //fmt.Println("Dialog:", strings.TrimSpace(n.Data)) 107 msg := Message{ 108 Ordinal: uint64(len(msgs)), 109 Author: character, 110 Body: strings.TrimSpace(n.Data), 111 ClientTime: datetime.Now(), 112 } 113 msgs = append(msgs, msg) 114 character = "" 115 } 116 for c := n.FirstChild; c != nil; c = c.NextSibling { 117 extractDialog(c) 118 } 119 } 120 121 func characterName(n *html.Node) string { 122 if n.Type != html.ElementNode || 123 n.Data != "b" || 124 n.FirstChild == nil { 125 return "" 126 } 127 128 if hasSpaces, _ := regexp.MatchString(`^\s+[^\s]`, n.FirstChild.Data); !hasSpaces { 129 return "" 130 } 131 return strings.TrimSpace(n.FirstChild.Data) 132 } 133 134 type cpair struct { 135 character string 136 cnt int 137 } 138 139 func topUsers(msgs []Message) []string { 140 userpat := regexp.MustCompile(`^[a-zA-Z][a-zA-Z\s]*\d*$`) 141 usermap := map[string]int{} 142 for _, msg := range msgs { 143 name := strings.TrimSpace(msg.Author) 144 if userpat.MatchString(name) { 145 usermap[name] += 1 146 } 147 } 148 pairs := []cpair{} 149 for name, cnt := range usermap { 150 if len(name) > 1 && !strings.HasPrefix(name, "ANOTHER") { 151 pairs = append(pairs, cpair{character: strings.ToLower(name), cnt: cnt}) 152 } 153 } 154 // sort descending by cnt 155 sort.Slice(pairs, func(i, j int) bool { 156 return pairs[j].cnt < pairs[i].cnt 157 }) 158 users := []string{} 159 for i, p := range pairs { 160 if i >= 30 { 161 break 162 } 163 users = append(users, p.character) 164 } 165 sort.Strings(users) 166 return users 167 }