From 1f4a0846240345844d1624834ed4473d68549607 Mon Sep 17 00:00:00 2001
From: Peter Kleiweg
Date: Sat, 6 Jun 2026 17:10:38 +0200
Subject: [PATCH] gone, trends

---
Review notes (this section is not part of the commit message):

* gone and trends exit with a message instead of panicking when an
  argument is missing or an input line has no tab; trends also returns
  quietly when its second file is empty.
* Whitespace in context lines (notably the indentation in collect.sh) is
  reconstructed; use "git apply --ignore-whitespace" if a hunk is
  rejected.
* The index lines of the two new files are omitted because their
  contents were revised.

 .gitignore           |   2 +
 Makefile             |   8 ++
 cmd/gone/gone.go     |  68 +++++++++++++++
 cmd/rang/rang.go     |   8 +-
 cmd/trends/trends.go | 116 +++++++++++++++++++++++++
 collect.sh           |   7 +-
 6 files changed, 200 insertions(+), 9 deletions(-)
 create mode 100644 cmd/gone/gone.go
 create mode 100644 cmd/trends/trends.go

diff --git a/.gitignore b/.gitignore
index 61ba594..289cd1a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,9 +45,11 @@ VRT/vrt
 bin/data2json
 bin/dates2json
 bin/flush
+bin/gone
 bin/items2count
 bin/rang
 bin/top20
+bin/trends
 bin/week2files
 20??
 corpus
diff --git a/Makefile b/Makefile
index ff35e3e..3cb08c4 100644
--- a/Makefile
+++ b/Makefile
@@ -21,9 +21,11 @@ all:
 	make bin/data2json
 	make bin/dates2json
 	make bin/flush
+	make bin/gone
 	make bin/items2count
 	make bin/rang
 	make bin/top20
+	make bin/trends
 	make bin/week2files
 
 bin/data2json: cmd/data2json/*.go
@@ -35,6 +37,9 @@ bin/dates2json: cmd/dates2json/*.go
 bin/flush: cmd/flush/*.go
 	go build -o $@ $^
 
+bin/gone: cmd/gone/*.go
+	go build -o $@ $^
+
 bin/items2count: cmd/items2count/*.go
 	go build -o $@ $^
 
@@ -44,6 +49,9 @@ bin/rang: cmd/rang/*.go
 bin/top20: cmd/top20/*.go
 	go build -o $@ $^
 
+bin/trends: cmd/trends/*.go
+	go build -o $@ $^
+
 bin/week2files: cmd/week2files/*.go
 	go build -o $@ $^
 
diff --git a/cmd/gone/gone.go b/cmd/gone/gone.go
new file mode 100644
--- /dev/null
+++ b/cmd/gone/gone.go
@@ -0,0 +1,68 @@
+// Command gone prints the lines of its first input file whose second
+// tab-separated field does not occur as a second field in its second
+// input file.
+//
+// Usage: gone FILE1 FILE2
+package main
+
+import (
+	e "codeberg.org/pebbe/errors"
+
+	"bufio"
+	"fmt"
+	"os"
+	"strings"
+)
+
+var (
+	x = e.ExitErr
+)
+
+// fields returns the first two tab-separated fields of line. It exits with
+// status 1 if the line has fewer than two fields.
+func fields(filename, line string) (string, string) {
+	aa := strings.Split(line, "\t")
+	if len(aa) < 2 {
+		fmt.Fprintf(os.Stderr, "%s: expected two tab-separated fields: %q\n", filename, line)
+		os.Exit(1)
+	}
+	return aa[0], aa[1]
+}
+
+func main() {
+	if len(os.Args) < 3 {
+		fmt.Fprintf(os.Stderr, "Usage: %s FILE1 FILE2\n", os.Args[0])
+		os.Exit(1)
+	}
+
+	// Remember the second field of every line of the second file.
+	current := make(map[string]bool)
+	fp, err := os.Open(os.Args[2])
+	x(err)
+	scanner := bufio.NewScanner(fp)
+	for scanner.Scan() {
+		_, word := fields(os.Args[2], scanner.Text())
+		current[word] = true
+	}
+	x(scanner.Err())
+	fp.Close()
+
+	// Print the lines of the first file whose second field was not seen.
+	var last string
+	fp, err = os.Open(os.Args[1])
+	x(err)
+	scanner = bufio.NewScanner(fp)
+	for scanner.Scan() {
+		first, word := fields(os.Args[1], scanner.Text())
+		if !current[word] {
+			fmt.Printf("%s\t%s\n", first, word)
+		}
+		last = first
+	}
+	x(scanner.Err())
+	fp.Close()
+
+	// Always finish with the first field of the last line read, followed by
+	// an empty second field.
+	fmt.Printf("%s\t\n", last)
+}
diff --git a/cmd/rang/rang.go b/cmd/rang/rang.go
index daa3368..c07cd8e 100644
--- a/cmd/rang/rang.go
+++ b/cmd/rang/rang.go
@@ -1,5 +1,7 @@
 package main
 
+// alto 'fp://node[....]' 'tt:%w\t%I' $files | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang
+
 import (
 	e "codeberg.org/pebbe/errors"
 
@@ -59,9 +61,3 @@ func main() {
 	}
 
 }
-
-/*
-
-alto 'fp://node[@pt="n"]' 'tt:%w\t%I' $files | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang
-
-*/
diff --git a/cmd/trends/trends.go b/cmd/trends/trends.go
new file mode 100644
--- /dev/null
+++ b/cmd/trends/trends.go
@@ -0,0 +1,116 @@
+// Command trends compares the integer attached to each word in two
+// tab-separated "number<TAB>word" files and prints the words whose scaled
+// number differs by more than a threshold.
+//
+// Usage: trends REFERENCE CURRENT
+package main
+
+import (
+	e "codeberg.org/pebbe/errors"
+
+	"bufio"
+	"fmt"
+	"os"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+// threshold is the smallest absolute difference that is still reported.
+const threshold = 0.05
+
+// Item is one reported word together with its difference.
+type Item struct {
+	word string
+	diff float64
+}
+
+var (
+	x = e.ExitErr
+)
+
+// parse splits a "number<TAB>word" line into its number and word. It exits
+// with status 1 if the line has fewer than two fields; a first field that
+// is not an integer is passed to x.
+func parse(filename, line string) (int, string) {
+	aa := strings.Split(line, "\t")
+	if len(aa) < 2 {
+		fmt.Fprintf(os.Stderr, "%s: expected two tab-separated fields: %q\n", filename, line)
+		os.Exit(1)
+	}
+	n, err := strconv.Atoi(aa[0])
+	x(err)
+	return n, aa[1]
+}
+
+func main() {
+	if len(os.Args) < 3 {
+		fmt.Fprintf(os.Stderr, "Usage: %s REFERENCE CURRENT\n", os.Args[0])
+		os.Exit(1)
+	}
+
+	// Read the reference file and remember the largest number in it.
+	refs := make(map[string]int)
+	refmax := 0
+	fp, err := os.Open(os.Args[1])
+	x(err)
+	scanner := bufio.NewScanner(fp)
+	for scanner.Scan() {
+		n, word := parse(os.Args[1], scanner.Text())
+		refs[word] = n
+		if n > refmax {
+			refmax = n
+		}
+	}
+	x(scanner.Err())
+	fp.Close()
+	refmax++
+
+	// Read the current file completely.
+	var lines []string
+	fp, err = os.Open(os.Args[2])
+	x(err)
+	scanner = bufio.NewScanner(fp)
+	for scanner.Scan() {
+		lines = append(lines, scanner.Text())
+	}
+	x(scanner.Err())
+	fp.Close()
+
+	// An empty current file leaves nothing to compare, and taking its last
+	// line below would panic.
+	if len(lines) == 0 {
+		return
+	}
+
+	// The number on the last line is used as the largest current number.
+	curmax, _ := parse(os.Args[2], lines[len(lines)-1])
+	curmax++
+
+	var items []Item
+	for _, line := range lines {
+		n, word := parse(os.Args[2], line)
+		m, ok := refs[word]
+		if !ok {
+			// A word missing from the reference gets the number just past
+			// the reference maximum.
+			m = refmax
+		}
+		diff := float64(m)/float64(refmax) - float64(n)/float64(curmax)
+		if diff > threshold || diff < -threshold {
+			items = append(items, Item{word: word, diff: diff})
+		}
+	}
+
+	// Largest difference first; equal differences are ordered by word.
+	sort.Slice(items, func(a, b int) bool {
+		if items[a].diff == items[b].diff {
+			return items[a].word < items[b].word
+		}
+		return items[a].diff > items[b].diff
+	})
+
+	for _, item := range items {
+		fmt.Printf("%f\t%s\n", item.diff, item.word)
+	}
+}
diff --git a/collect.sh b/collect.sh
index ac25118..ac5acbd 100755
--- a/collect.sh
+++ b/collect.sh
@@ -165,11 +165,12 @@ do
 
         # ranglijsten
 
-        say $part-rang-noun=$ds=$i
+        say $part-rang-$ds-$i
         alto \
-            'fp://node[@pt="n"]' 'tt:%w\t%I' $files \
+            'fp://node[((@pt="n" or @neclass) and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass])]' \
+            'tt:%w\t%I' $files \
             | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
-            > $part-rang-noun=$ds=$i
+            > $part-rang-$ds-$i
 
     done
 done