geen xquery meer gebruikt, te traag

This commit is contained in:
Peter Kleiweg
2026-04-10 14:14:35 +02:00
parent 040a923e98
commit 929207d365
5 changed files with 306 additions and 25 deletions

View File

@@ -5,20 +5,13 @@ import (
// "github.com/kr/pretty"
"bufio"
"encoding/xml"
"fmt"
"os"
"regexp"
"sort"
"strings"
)
// Item is the decoding target for one XML record whose root element is <i>
// and whose children are <m>, any number of <t>, and <w>.
type Item struct {
	// XMLName pins the expected root element name to "i".
	XMLName xml.Name `xml:"i"`
	// Msg holds the character data of the <m> child element.
	Msg string `xml:"m"`
	// Tags collects the character data of every <t> child element,
	// one slice entry per element.
	Tags []string `xml:"t"`
	// Word holds the character data of the <w> child element.
	Word string `xml:"w"`
}
type Word struct {
word string
sortkey string
@@ -33,8 +26,10 @@ type Tag struct {
}
var (
x = e.ExitErr
words = make(map[string]*Word)
x = e.ExitErr
words = make(map[string]*Word)
reTag = regexp.MustCompile(`tag: "((?:\\.|[^\\"])*)"`)
reUnquote = regexp.MustCompile(`\\.`)
ignore = map[string]bool{
"Algemeen": true,
@@ -48,26 +43,28 @@ func main() {
scanner := bufio.NewScanner(os.Stdin)
for scanner.Scan() {
var item Item
line := scanner.Text()
x(xml.Unmarshal([]byte(line), &item))
w, ok := words[item.Word]
aa := strings.Split(line, "\t")
word := aa[0]
tags := aa[1]
lbl := aa[2]
w, ok := words[word]
if !ok {
w = &Word{
word: item.Word,
sortkey: strings.ToLower(item.Word),
word: word,
sortkey: strings.ToLower(word),
tags: make(map[string]map[string]int),
}
words[item.Word] = w
words[word] = w
}
w.count++
lbl := item.Msg[:strings.Index(item.Msg, ".")]
for _, tag := range item.Tags {
lbl = lbl[:strings.Index(lbl, ".")]
for _, tag := range parseTags(tags) {
if !ignore[tag] {
if _, ok := w.tags[lbl]; !ok {
w.tags[lbl] = make(map[string]int)
}
if tag != item.Word {
if tag != word {
w.tags[lbl][tag] = w.tags[lbl][tag] + 1
}
}
@@ -95,6 +92,15 @@ func main() {
}
// parseTags extracts every tag value found in s.
//
// A tag is any substring matching reTag, i.e. `tag: "value"`, where value may
// contain backslash-escaped characters (for example \" or \\). The values are
// returned in order of appearance, with the escaping backslashes removed by
// unquote. Text in s that does not match that form is ignored.
//
// The result is never nil: when s contains no tags an empty slice is returned.
func parseTags(s string) []string {
	matches := reTag.FindAllStringSubmatch(s, -1)

	// The number of tags is known before the loop, so allocate the result
	// once instead of letting append grow it step by step.
	tags := make([]string, 0, len(matches))
	for _, m := range matches {
		// m[1] is the single capture group: the text between the quotes.
		tags = append(tags, unquote(m[1]))
	}
	return tags
}
func getTag(tags map[string]map[string]int) string {
all := make([]Tag, 0)
@@ -155,3 +161,9 @@ func getTag(tags map[string]map[string]int) string {
return strings.Join(aa, ", ")
}
// unquote strips the escaping backslash from every escape sequence in text.
//
// An escape sequence is whatever reUnquote matches: a backslash followed by
// one character. Each such sequence is replaced by the character alone, so
// `\"` becomes `"` and `\\` becomes `\`. Everything else is copied unchanged.
func unquote(text string) string {
	// dropBackslash receives one match (backslash + escaped character) and
	// returns it without its leading backslash byte.
	dropBackslash := func(escaped string) string {
		return escaped[1:]
	}
	return reUnquote.ReplaceAllStringFunc(text, dropBackslash)
}