diff --git a/cmd/items2count/items2count.go b/cmd/items2count/items2count.go index 552565a..30319e6 100644 --- a/cmd/items2count/items2count.go +++ b/cmd/items2count/items2count.go @@ -5,20 +5,13 @@ import ( // "github.com/kr/pretty" "bufio" - "encoding/xml" "fmt" "os" + "regexp" "sort" "strings" ) -type Item struct { - XMLName xml.Name `xml:"i"` - Msg string `xml:"m"` - Tags []string `xml:"t"` - Word string `xml:"w"` -} - type Word struct { word string sortkey string @@ -33,8 +26,10 @@ type Tag struct { } var ( - x = e.ExitErr - words = make(map[string]*Word) + x = e.ExitErr + words = make(map[string]*Word) + reTag = regexp.MustCompile(`tag: "((?:\\.|[^\\"])*)"`) + reUnquote = regexp.MustCompile(`\\.`) ignore = map[string]bool{ "Algemeen": true, @@ -48,26 +43,28 @@ func main() { scanner := bufio.NewScanner(os.Stdin) for scanner.Scan() { - var item Item line := scanner.Text() - x(xml.Unmarshal([]byte(line), &item)) - w, ok := words[item.Word] + aa := strings.Split(line, "\t") + word := aa[0] + tags := aa[1] + lbl := aa[2] + w, ok := words[word] if !ok { w = &Word{ - word: item.Word, - sortkey: strings.ToLower(item.Word), + word: word, + sortkey: strings.ToLower(word), tags: make(map[string]map[string]int), } - words[item.Word] = w + words[word] = w } w.count++ - lbl := item.Msg[:strings.Index(item.Msg, ".")] - for _, tag := range item.Tags { + lbl = lbl[:strings.Index(lbl, ".")] + for _, tag := range parseTags(tags) { if !ignore[tag] { if _, ok := w.tags[lbl]; !ok { w.tags[lbl] = make(map[string]int) } - if tag != item.Word { + if tag != word { w.tags[lbl][tag] = w.tags[lbl][tag] + 1 } } @@ -95,6 +92,15 @@ func main() { } +func parseTags(s string) []string { + tags := make([]string, 0) + aa := reTag.FindAllStringSubmatch(s, -1) + for _, a := range aa { + tags = append(tags, unquote(a[1])) + } + return tags +} + func getTag(tags map[string]map[string]int) string { all := make([]Tag, 0) @@ -155,3 +161,9 @@ func getTag(tags map[string]map[string]int) string { return strings.Join(aa, ", ") } + +func unquote(text string) string { + return reUnquote.ReplaceAllStringFunc(text, func(s string) string { + return s[1:] + }) +} diff --git a/collect.sh b/collect.sh index 4f60929..122d51a 100755 --- a/collect.sh +++ b/collect.sh @@ -48,19 +48,43 @@ do # tellingen met tags - alto tq:../xquery/nieuwe_namen.xq $files | sort | uniq | items2count > $part-nieuwe-namen-$ds-$i + alto \ + 'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \ + 'tt:%w\t%d\t%I' $files \ + | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ + | items2count > $part-nieuwe-namen-$ds-$i top20 $part-nieuwe-namen-$ds-$i - alto tq:../xquery/nieuwe_woorden.xq $files | sort | uniq | items2count > $part-nieuwe-woorden-$ds-$i + alto \ + 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \ + 'tt:%w\t%d\t%I' $files \ + | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ + | items2count > $part-nieuwe-woorden-$ds-$i top20 $part-nieuwe-woorden-$ds-$i - alto tq:../xquery/locaties.xq $files | sort | uniq | items2count > $part-locaties-$ds-$i + alto \ + 'fp://node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"])]' \ + 'tt:%l\t%d\t%I' $files \ + | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ + | items2count > $part-locaties-$ds-$i - alto tq:../xquery/personen.xq $files | sort | uniq | items2count > $part-personen-$ds-$i + alto \ + 'fp://node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"])]' \ + 'tt:%l\t%d\t%I' $files \ + | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ + | items2count > $part-personen-$ds-$i - alto tq:../xquery/organisaties.xq $files | sort | uniq | items2count > $part-organisaties-$ds-$i + alto \ + 'fp://node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"])]' \ + 'tt:%l\t%d\t%I' $files \ + | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ + | items2count > $part-organisaties-$ds-$i - alto tq:../xquery/overige_namen.xq $files | sort | uniq | items2count > $part-overige-namen-$ds-$i + alto \ + 'fp://node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"])]' \ + 'tt:%l\t%d\t%I' $files \ + | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ + | items2count > $part-overige-namen-$ds-$i # tellingen met postags diff --git a/xquery/README b/xquery/README new file mode 100644 index 0000000..2fae210 --- /dev/null +++ b/xquery/README @@ -0,0 +1,2 @@ +dit wordt niet meer gebruikt +xquery is veel te traag diff --git a/xquery/collect.sh.oud b/xquery/collect.sh.oud new file mode 100644 index 0000000..4f60929 --- /dev/null +++ b/xquery/collect.sh.oud @@ -0,0 +1,86 @@ +#!/bin/bash + +set -e + +unset CDPATH +PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +export TZ=Europe/Amsterdam + +if [ "$1" = "" ] +then + ds=`ISOWeek -7` +else + case "$1" in + 2[0-9][0-9][0-9]-[0-5][0-9]) + ds=$1 + ;; + *) + echo INVALID + exit 1 + ;; + esac +fi + +cd /net/corpora/nlnieuws/data + +declare -A parts +parts[alles]='.' +parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso' +parts[groningen]='GG|Sikkom' +parts[AT5]='AT5' +parts[GG]='GG' +parts[NOS]='NOS' +parts[NU]='NU' +parts[NieuwsNL]='NieuwsNL' +parts[RO]='RO' +parts[Sargasso]='Sargasso' +parts[Sikkom]='Sikkom' +parts[Tzum]='Tzum' +parts[VRT]='VRT' + +for part in ${!parts[@]} +do + regex=${parts[$part]} + + for i in 1 4 + do + files=$(find .. $(week2files $ds $i) | grep -E "$regex") + + # tellingen met tags + + alto tq:../xquery/nieuwe_namen.xq $files | sort | uniq | items2count > $part-nieuwe-namen-$ds-$i + top20 $part-nieuwe-namen-$ds-$i + + alto tq:../xquery/nieuwe_woorden.xq $files | sort | uniq | items2count > $part-nieuwe-woorden-$ds-$i + top20 $part-nieuwe-woorden-$ds-$i + + alto tq:../xquery/locaties.xq $files | sort | uniq | items2count > $part-locaties-$ds-$i + + alto tq:../xquery/personen.xq $files | sort | uniq | items2count > $part-personen-$ds-$i + + alto tq:../xquery/organisaties.xq $files | sort | uniq | items2count > $part-organisaties-$ds-$i + + alto tq:../xquery/overige_namen.xq $files | sort | uniq | items2count > $part-overige-namen-$ds-$i + + # tellingen met postags + + alto \ + 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @ + his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \ + 'tt:%w\t%l\t%P\t%I' $files \ + | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ + | sed -e 's/\(.*\)\t.*/\1/' | uniq -c \ + | grep -v '^ *1 ' \ + | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \ + > $part-nieuwe-woorden-extra-$ds-$i + + alto \ + 'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \ + 'tt:%w\t%P\t%I' $files \ + | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ + | sed -e 's/\(.*\)\t.*/\1/' | uniq -c \ + | grep -v '^ *1 ' \ + | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \ + > $part-nieuwe-adjww-extra-$ds-$i + done +done diff --git a/xquery/items2count.go.oud b/xquery/items2count.go.oud new file mode 100644 index 0000000..552565a --- /dev/null +++ b/xquery/items2count.go.oud @@ -0,0 +1,157 @@ +package main + +import ( + e "codeberg.org/pebbe/errors" + // "github.com/kr/pretty" + + "bufio" + "encoding/xml" + "fmt" + "os" + "sort" + "strings" +) + +type Item struct { + XMLName xml.Name `xml:"i"` + Msg string `xml:"m"` + Tags []string `xml:"t"` + Word string `xml:"w"` +} + +type Word struct { + word string + sortkey string + count int + tags map[string]map[string]int +} + +type Tag struct { + tag string + sortkey string + count int +} + +var ( + x = e.ExitErr + words = make(map[string]*Word) + + ignore = map[string]bool{ + "Algemeen": true, + "Artikelen": true, + "Nieuws": true, + "Recensies": true, + } +) + +func main() { + + scanner := bufio.NewScanner(os.Stdin) + for scanner.Scan() { + var item Item + line := scanner.Text() + x(xml.Unmarshal([]byte(line), &item)) + w, ok := words[item.Word] + if !ok { + w = &Word{ + word: item.Word, + sortkey: strings.ToLower(item.Word), + tags: make(map[string]map[string]int), + } + words[item.Word] = w + } + w.count++ + lbl := item.Msg[:strings.Index(item.Msg, ".")] + for _, tag := range item.Tags { + if !ignore[tag] { + if _, ok := w.tags[lbl]; !ok { + w.tags[lbl] = make(map[string]int) + } + if tag != item.Word { + w.tags[lbl][tag] = w.tags[lbl][tag] + 1 + } + } + } + } + x(scanner.Err()) + + wordlist := make([]*Word, 0, len(words)) + for _, value := range words { + if value.count > 1 { + wordlist = append(wordlist, value) + } + } + + sort.Slice(wordlist, func(a, b int) bool { + if wordlist[a].count != wordlist[b].count { + return wordlist[a].count > wordlist[b].count + } + return wordlist[a].sortkey < wordlist[b].sortkey + }) + + for _, w := range wordlist { + fmt.Printf("%6d\t%s\t%s\n", w.count, w.word, getTag(w.tags)) + } + +} + +func getTag(tags map[string]map[string]int) string { + + all := make([]Tag, 0) + + for _, tagv := range tags { + n := 0 + tt := make([]string, 0) + for key, value := range tagv { + if value > n { + n = value + tt = []string{key} + } else if value == n { + tt = append(tt, key) + } + } + for _, t := range tt { + all = append(all, Tag{tag: t, count: n, sortkey: strings.ToLower(t)}) + } + } + + sort.Slice(all, func(a, b int) bool { + if all[a].count != all[b].count { + return all[a].count > all[b].count + } + if all[a].sortkey != all[b].sortkey { + return all[a].sortkey < all[b].sortkey + } + return all[a].tag < all[b].tag + }) + + needSort := false + for i := 1; i < len(all); i++ { + if all[i-1].sortkey == all[i].sortkey { + all[i-1].count += all[i].count + all = append(all[:i], all[i+1:]...) + i-- + needSort = true + } + } + if needSort { + sort.Slice(all, func(a, b int) bool { + if all[a].count != all[b].count { + return all[a].count > all[b].count + } + if all[a].sortkey != all[b].sortkey { + return all[a].sortkey < all[b].sortkey + } + return all[a].tag < all[b].tag + }) + } + + aa := make([]string, 0, len(all)) + for _, n := range all { + if n.count > 1 { + aa = append(aa, n.tag) + } + } + + return strings.Join(aa, ", ") +}