diff --git a/.gitignore b/.gitignore index 82e6053..f9b157f 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,7 @@ VRT/metadata VRT/vrt bin/ISOWeek bin/flush +bin/items2count bin/score bin/top20 bin/week2files diff --git a/Makefile b/Makefile index 8745534..d6b4301 100644 --- a/Makefile +++ b/Makefile @@ -10,16 +10,20 @@ all: make -C Sikkom make -C Tzum make -C VRT - make bin/ISOWeek make bin/flush + make bin/ISOWeek + make bin/items2count make bin/score make bin/top20 make bin/week2files +bin/flush: cmd/flush/*.go + go build -o $@ $^ + bin/ISOWeek: cmd/ISOWeek/*.go go build -o $@ $^ -bin/flush: cmd/flush/*.go +bin/items2count: cmd/items2count/*.go go build -o $@ $^ bin/score: cmd/score/*.go diff --git a/cmd/items2count/items2count.go b/cmd/items2count/items2count.go new file mode 100644 index 0000000..552565a --- /dev/null +++ b/cmd/items2count/items2count.go @@ -0,0 +1,157 @@ +package main + +import ( + e "codeberg.org/pebbe/errors" + // "github.com/kr/pretty" + + "bufio" + "encoding/xml" + "fmt" + "os" + "sort" + "strings" +) + +type Item struct { + XMLName xml.Name `xml:"i"` + Msg string `xml:"m"` + Tags []string `xml:"t"` + Word string `xml:"w"` +} + +type Word struct { + word string + sortkey string + count int + tags map[string]map[string]int +} + +type Tag struct { + tag string + sortkey string + count int +} + +var ( + x = e.ExitErr + words = make(map[string]*Word) + + ignore = map[string]bool{ + "Algemeen": true, + "Artikelen": true, + "Nieuws": true, + "Recensies": true, + } +) + +func main() { + + scanner := bufio.NewScanner(os.Stdin) + for scanner.Scan() { + var item Item + line := scanner.Text() + x(xml.Unmarshal([]byte(line), &item)) + w, ok := words[item.Word] + if !ok { + w = &Word{ + word: item.Word, + sortkey: strings.ToLower(item.Word), + tags: make(map[string]map[string]int), + } + words[item.Word] = w + } + w.count++ + lbl := item.Msg[:strings.Index(item.Msg, ".")] + for _, tag := range item.Tags { + if !ignore[tag] { + if _, ok := w.tags[lbl]; !ok { + w.tags[lbl] = make(map[string]int) + } + if tag != item.Word { + w.tags[lbl][tag] = w.tags[lbl][tag] + 1 + } + } + } + } + x(scanner.Err()) + + wordlist := make([]*Word, 0, len(words)) + for _, value := range words { + if value.count > 1 { + wordlist = append(wordlist, value) + } + } + + sort.Slice(wordlist, func(a, b int) bool { + if wordlist[a].count != wordlist[b].count { + return wordlist[a].count > wordlist[b].count + } + return wordlist[a].sortkey < wordlist[b].sortkey + }) + + for _, w := range wordlist { + fmt.Printf("%6d\t%s\t%s\n", w.count, w.word, getTag(w.tags)) + } + +} + +func getTag(tags map[string]map[string]int) string { + + all := make([]Tag, 0) + + for _, tagv := range tags { + n := 0 + tt := make([]string, 0) + for key, value := range tagv { + if value > n { + n = value + tt = []string{key} + } else if value == n { + tt = append(tt, key) + } + } + for _, t := range tt { + all = append(all, Tag{tag: t, count: n, sortkey: strings.ToLower(t)}) + } + } + + sort.Slice(all, func(a, b int) bool { + if all[a].count != all[b].count { + return all[a].count > all[b].count + } + if all[a].sortkey != all[b].sortkey { + return all[a].sortkey < all[b].sortkey + } + return all[a].tag < all[b].tag + }) + + needSort := false + for i := 1; i < len(all); i++ { + if all[i-1].sortkey == all[i].sortkey { + all[i-1].count += all[i].count + all = append(all[:i], all[i+1:]...) + i-- + needSort = true + } + } + if needSort { + sort.Slice(all, func(a, b int) bool { + if all[a].count != all[b].count { + return all[a].count > all[b].count + } + if all[a].sortkey != all[b].sortkey { + return all[a].sortkey < all[b].sortkey + } + return all[a].tag < all[b].tag + }) + } + + aa := make([]string, 0, len(all)) + for _, n := range all { + if n.count > 1 { + aa = append(aa, n.tag) + } + } + + return strings.Join(aa, ", ") +} diff --git a/cmd/top20/top20.go b/cmd/top20/top20.go index 7b81202..8e6c51e 100644 --- a/cmd/top20/top20.go +++ b/cmd/top20/top20.go @@ -34,7 +34,7 @@ func main() { x(err) scanner := bufio.NewScanner(fp) for scanner.Scan() { - seen[strings.SplitN(scanner.Text(), "\t", 2)[1]] = true + seen[strings.Split(scanner.Text(), "\t")[1]] = true } x(scanner.Err()) x(fp.Close()) @@ -49,7 +49,7 @@ func main() { n := 0 for scanner.Scan() && n < 20 { line := scanner.Text() - w := strings.SplitN(line, "\t", 2)[1] + w := strings.Split(line, "\t")[1] if seen[w] { continue } diff --git a/collect.sh b/collect.sh index 223f8f6..4f60929 100755 --- a/collect.sh +++ b/collect.sh @@ -23,70 +23,64 @@ fi cd /net/corpora/nlnieuws/data -for i in 1 4 +declare -A parts +parts[alles]='.' +parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso' +parts[groningen]='GG|Sikkom' +parts[AT5]='AT5' +parts[GG]='GG' +parts[NOS]='NOS' +parts[NU]='NU' +parts[NieuwsNL]='NieuwsNL' +parts[RO]='RO' +parts[Sargasso]='Sargasso' +parts[Sikkom]='Sikkom' +parts[Tzum]='Tzum' +parts[VRT]='VRT' + +for part in ${!parts[@]} do + regex=${parts[$part]} - files=$(find .. $(week2files $ds $i)) + for i in 1 4 + do + files=$(find .. $(week2files $ds $i) | grep -E "$regex") - alto \ - 'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \ - tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-namen-$ds-$i + # tellingen met tags - top20 nieuw-namen-$ds-$i - # score nieuw-namen-$ds-$i > nieuw-namen-$ds-$i.score + alto tq:../xquery/nieuwe_namen.xq $files | sort | uniq | items2count > $part-nieuwe-namen-$ds-$i + top20 $part-nieuwe-namen-$ds-$i - alto \ - 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @ - his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \ - tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-woorden-$ds-$i + alto tq:../xquery/nieuwe_woorden.xq $files | sort | uniq | items2count > $part-nieuwe-woorden-$ds-$i + top20 $part-nieuwe-woorden-$ds-$i - top20 nieuw-woorden-$ds-$i - # score nieuw-woorden-$ds-$i > nieuw-woorden-$ds-$i.score + alto tq:../xquery/locaties.xq $files | sort | uniq | items2count > $part-locaties-$ds-$i - alto \ - 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @ - his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \ - 'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-woorden-extra-$ds-$i + alto tq:../xquery/personen.xq $files | sort | uniq | items2count > $part-personen-$ds-$i - top20 nieuw-woorden-extra-$ds-$i - # score nieuw-woorden-extra-$ds-$i > nieuw-woorden-extra-$ds-$i.score + alto tq:../xquery/organisaties.xq $files | sort | uniq | items2count > $part-organisaties-$ds-$i - alto \ - 'fp://node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"])]' \ - tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > locaties-$ds-$i + alto tq:../xquery/overige_namen.xq $files | sort | uniq | items2count > $part-overige-namen-$ds-$i - # score locaties-$ds-$i > locaties-$ds-$i.score + # tellingen met postags - alto \ - 'fp://node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"])]' \ - tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > personen-$ds-$i - - # score personen-$ds-$i > personen-$ds-$i.score - - alto \ - 'fp://node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"])]' \ - tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > organisaties-$ds-$i - - # score organisaties-$ds-$i > organisaties-$ds-$i.score - - alto \ - 'fp://node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"])]' \ - tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > overige-namen-$ds-$i - - # score overige-namen-$ds-$i > overige-namen-$ds-$i.score - - alto \ - 'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \ - 'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-adjww-extra-$ds-$i - - # score nieuws-adjww-extra-$ds-$i > nieuw-adjww-extra-$ds-$i.score + alto \ + 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @ + his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \ + 'tt:%w\t%l\t%P\t%I' $files \ + | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ + | sed -e 's/\(.*\)\t.*/\1/' | uniq -c \ + | grep -v '^ *1 ' \ + | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \ + > $part-nieuwe-woorden-extra-$ds-$i + alto \ + 'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \ + 'tt:%w\t%P\t%I' $files \ + | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ + | sed -e 's/\(.*\)\t.*/\1/' | uniq -c \ + | grep -v '^ *1 ' \ + | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \ + > $part-nieuwe-adjww-extra-$ds-$i + done done diff --git a/collect2.sh b/collect2.sh deleted file mode 100755 index 00c93db..0000000 --- a/collect2.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash - -set -e - -unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH -export TZ=Europe/Amsterdam - -if [ "$1" = "" ] -then - ds=`ISOWeek -7` -else - case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) - ds=$1 - ;; - *) - echo INVALID - exit 1 - ;; - esac -fi - -cd /net/corpora/nlnieuws/data - -for corpus in AT5 GG NOS NU NieuwsNL RO Sargasso Sikkom Tzum VRT -do - for i in 1 4 - do - - files=$(find ../$corpus $(week2files $ds $i)) - - alto \ - 'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \ - tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-nieuw-namen-$ds-$i - - top20 $corpus-nieuw-namen-$ds-$i - # score $corpus-nieuw-namen-$ds-$i > $corpus-nieuw-namen-$ds-$i.score - - alto \ - 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @ - his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \ - tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-nieuw-woorden-$ds-$i - - top20 $corpus-nieuw-woorden-$ds-$i - # score $corpus-nieuw-woorden-$ds-$i > $corpus-nieuw-woorden-$ds-$i.score - - alto \ - 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @ - his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \ - 'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-nieuw-woorden-extra-$ds-$i - - top20 $corpus-nieuw-woorden-extra-$ds-$i - # score $corpus-nieuw-woorden-extra-$ds-$i > $corpus-nieuw-woorden-extra-$ds-$i.score - - alto \ - 'fp://node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"])]' \ - tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-locaties-$ds-$i - - # score $corpus-locaties-$ds-$i > $corpus-locaties-$ds-$i.score - - alto \ - 'fp://node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"])]' \ - tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-personen-$ds-$i - - # score $corpus-personen-$ds-$i > $corpus-personen-$ds-$i.score - - alto \ - 'fp://node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"])]' \ - tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-organisaties-$ds-$i - - # score $corpus-organisaties-$ds-$i > $corpus-organisaties-$ds-$i.score - - alto \ - 'fp://node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"])]' \ - tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-overige-namen-$ds-$i - - # score $corpus-overige-namen-$ds-$i > $corpus-overige-namen-$ds-$i.score - - alto \ - 'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \ - 'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \ - sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-nieuw-adjww-extra-$ds-$i - - # score $corpus-nieuws-adjww-extra-$ds-$i > $corpus-nieuw-adjww-extra-$ds-$i.score - - done -done diff --git a/go.mod b/go.mod index e583a0e..66cde4c 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,12 @@ require github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 require ( codeberg.org/pebbe/errors v0.4.0 + github.com/kr/pretty v0.3.1 github.com/pebbe/textcat/v2 v2.3.0 ) -require github.com/pebbe/util v0.9.0 // indirect +require ( + github.com/kr/text v0.2.0 // indirect + github.com/pebbe/util v0.9.0 // indirect + github.com/rogpeppe/go-internal v1.9.0 // indirect +) diff --git a/go.sum b/go.sum index 899fa8d..1d09548 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,16 @@ codeberg.org/pebbe/errors v0.4.0 h1:G05wsXpC/LRPaL02QYDwtz0sWFWQcIWK1s+MC79LBzU= codeberg.org/pebbe/errors v0.4.0/go.mod h1:O7PPxUJM1bWRHq11CRK3wqVaH/3NnRaSVZvh3UhzDCY= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 h1:tQbR4RKFBFi0+Ll69dXejKKUbQVNaOAT2fjlDvSAfx4= github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5/go.mod h1:kQE2lxPgVKe0JsBZMFFfMm5zBDCuRhaHFKOBzZeCLiw= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/pebbe/textcat/v2 v2.3.0 h1:RB2egIQgI2a2Ls+I9No6KFQKCZBIFt8Cc/SWCnVtC7Y= github.com/pebbe/textcat/v2 v2.3.0/go.mod h1:WLXWuL+fOlQJqn6LmubjD+e78hCC6Y/rAWInh0wq/kg= github.com/pebbe/util v0.9.0 h1:PMZd+CpWb8GbWEmFGlL3qd6XPuywl6xFIbrXWi870OA= github.com/pebbe/util v0.9.0/go.mod h1:ynWl/SFX4+Seb9fpjVlYevr1f4TP7FrCmyZHiBCg69Q= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= +github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= diff --git a/xquery/howto b/xquery/howto deleted file mode 100644 index 6dd4a53..0000000 --- a/xquery/howto +++ /dev/null @@ -1,12 +0,0 @@ -alto *.data.dz tq:nieuwe_namen.xq | sort | uniq > items.txt - -voor elk item dit bijwerken: - - type Item struct { - count int - tags map[string]int - } - - items := make(map[string]Item) - - diff --git a/xquery/locaties.xq b/xquery/locaties.xq new file mode 100644 index 0000000..b060c61 --- /dev/null +++ b/xquery/locaties.xq @@ -0,0 +1,16 @@ +for $x in //node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"])] + return ( + {replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "")} +{ +for $i in data(/alpino_ds/metadata/meta[@name="tag"]/@value) + return {$i} +} + { data($x//@lemma) } +, ' ' ) + +(: + +{ data(/alpino_ds/sentence/@sentid) } +{ replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "") } + +:) diff --git a/xquery/nieuwe_namen.xq b/xquery/nieuwe_namen.xq index cb48f45..86d2d61 100644 --- a/xquery/nieuwe_namen.xq +++ b/xquery/nieuwe_namen.xq @@ -2,10 +2,6 @@ for $x in //node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and n return ( {replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "")} { -for $i in data(/alpino_ds/metadata/meta[@name="cat"]/@value) - return {$i} -} -{ for $i in data(/alpino_ds/metadata/meta[@name="tag"]/@value) return {$i} } diff --git a/xquery/nieuwe_woorden.xq b/xquery/nieuwe_woorden.xq new file mode 100644 index 0000000..ee08a03 --- /dev/null +++ b/xquery/nieuwe_woorden.xq @@ -0,0 +1,16 @@ +for $x in //node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")] + return ( + {replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "")} +{ +for $i in data(/alpino_ds/metadata/meta[@name="tag"]/@value) + return {$i} +} + { data($x//@word) } +, ' ' ) + +(: + +{ data(/alpino_ds/sentence/@sentid) } +{ replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "") } + +:) diff --git a/xquery/organisaties.xq b/xquery/organisaties.xq new file mode 100644 index 0000000..6e11c52 --- /dev/null +++ b/xquery/organisaties.xq @@ -0,0 +1,16 @@ +for $x in //node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"])] + return ( + {replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "")} +{ +for $i in data(/alpino_ds/metadata/meta[@name="tag"]/@value) + return {$i} +} + { data($x//@lemma) } +, ' ' ) + +(: + +{ data(/alpino_ds/sentence/@sentid) } +{ replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "") } + +:) diff --git a/xquery/overige_namen.xq b/xquery/overige_namen.xq new file mode 100644 index 0000000..8e60477 --- /dev/null +++ b/xquery/overige_namen.xq @@ -0,0 +1,16 @@ +for $x in //node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"])] + return ( + {replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "")} +{ +for $i in data(/alpino_ds/metadata/meta[@name="tag"]/@value) + return {$i} +} + { data($x//@lemma) } +, ' ' ) + +(: + +{ data(/alpino_ds/sentence/@sentid) } +{ replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "") } + +:) diff --git a/xquery/personen.xq b/xquery/personen.xq new file mode 100644 index 0000000..9b17caf --- /dev/null +++ b/xquery/personen.xq @@ -0,0 +1,16 @@ +for $x in //node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"])] + return ( + {replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "")} +{ +for $i in data(/alpino_ds/metadata/meta[@name="tag"]/@value) + return {$i} +} + { data($x//@lemma) } +, ' ' ) + +(: + +{ data(/alpino_ds/sentence/@sentid) } +{ replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "") } + +:)