package main import ( e "codeberg.org/pebbe/errors" // "github.com/kr/pretty" "bufio" "fmt" "os" "regexp" "sort" "strings" ) type Word struct { word string sortkey string count int tags map[string]map[string]int } type Tag struct { tag string sortkey string count int } var ( x = e.ExitErr words = make(map[string]*Word) reTag = regexp.MustCompile(`tag: "((?:\\.|[^\\"])*)"`) reUnquote = regexp.MustCompile(`\\.`) ignore = map[string]bool{ "Algemeen": true, "Artikelen": true, "Nieuws": true, "Recensies": true, } ) func main() { scanner := bufio.NewScanner(os.Stdin) for scanner.Scan() { line := scanner.Text() aa := strings.Split(line, "\t") word := aa[0] tags := aa[1] lbl := aa[2] if n := len(aa); n > 3 { lbl = aa[n-1] for i := 2; i < n-1; i++ { word += "\t" + aa[i] } } w, ok := words[word] if !ok { w = &Word{ word: word, sortkey: strings.ToLower(word), tags: make(map[string]map[string]int), } words[word] = w } w.count++ lbl = lbl[:strings.Index(lbl, ".")] for _, tag := range parseTags(tags) { if !ignore[tag] { if _, ok := w.tags[lbl]; !ok { w.tags[lbl] = make(map[string]int) } if tag != word { w.tags[lbl][tag] = w.tags[lbl][tag] + 1 } } } } x(scanner.Err()) wordlist := make([]*Word, 0, len(words)) for _, value := range words { if value.count > 1 { wordlist = append(wordlist, value) } } sort.Slice(wordlist, func(a, b int) bool { if wordlist[a].count != wordlist[b].count { return wordlist[a].count > wordlist[b].count } return wordlist[a].sortkey < wordlist[b].sortkey }) for _, w := range wordlist { var tail string i := strings.Index(w.word, "\t") if i > 0 { tail = w.word[i:] w.word = w.word[:i] } fmt.Printf("%6d\t%s\t%s%s\n", w.count, w.word, getTag(w.tags), tail) } } func parseTags(s string) []string { tags := make([]string, 0) aa := reTag.FindAllStringSubmatch(s, -1) for _, a := range aa { tags = append(tags, unquote(a[1])) } return tags } func getTag(tags map[string]map[string]int) string { all := make([]Tag, 0) for _, tagv := range tags { n := 0 tt := make([]string, 0) for key, value := range tagv { if value > n { n = value tt = []string{key} } else if value == n { tt = append(tt, key) } } for _, t := range tt { all = append(all, Tag{tag: t, count: n, sortkey: strings.ToLower(t)}) } } sort.Slice(all, func(a, b int) bool { if all[a].count != all[b].count { return all[a].count > all[b].count } if all[a].sortkey != all[b].sortkey { return all[a].sortkey < all[b].sortkey } return all[a].tag < all[b].tag }) needSort := false for i := 1; i < len(all); i++ { if all[i-1].sortkey == all[i].sortkey { all[i-1].count += all[i].count all = append(all[:i], all[i+1:]...) i-- needSort = true } } if needSort { sort.Slice(all, func(a, b int) bool { if all[a].count != all[b].count { return all[a].count > all[b].count } if all[a].sortkey != all[b].sortkey { return all[a].sortkey < all[b].sortkey } return all[a].tag < all[b].tag }) } aa := make([]string, 0, len(all)) for _, n := range all { if n.count > 1 { aa = append(aa, n.tag) } } return strings.Join(aa, ", ") } func unquote(text string) string { return reUnquote.ReplaceAllStringFunc(text, func(s string) string { return s[1:] }) }