diff --git a/.gitignore b/.gitignore
index 82e6053..f9b157f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@ VRT/metadata
VRT/vrt
bin/ISOWeek
bin/flush
+bin/items2count
bin/score
bin/top20
bin/week2files
diff --git a/Makefile b/Makefile
index 8745534..d6b4301 100644
--- a/Makefile
+++ b/Makefile
@@ -10,16 +10,20 @@ all:
make -C Sikkom
make -C Tzum
make -C VRT
- make bin/ISOWeek
make bin/flush
+ make bin/ISOWeek
+ make bin/items2count
make bin/score
make bin/top20
make bin/week2files
+bin/flush: cmd/flush/*.go
+ go build -o $@ $^
+
bin/ISOWeek: cmd/ISOWeek/*.go
go build -o $@ $^
-bin/flush: cmd/flush/*.go
+bin/items2count: cmd/items2count/*.go
go build -o $@ $^
bin/score: cmd/score/*.go
diff --git a/cmd/items2count/items2count.go b/cmd/items2count/items2count.go
new file mode 100644
index 0000000..552565a
--- /dev/null
+++ b/cmd/items2count/items2count.go
@@ -0,0 +1,157 @@
+package main
+
+import (
+ e "codeberg.org/pebbe/errors"
+ // "github.com/kr/pretty"
+
+ "bufio"
+ "encoding/xml"
+ "fmt"
+ "os"
+ "sort"
+ "strings"
+)
+
+type Item struct { // Item is one input line, decoded from an XML <i> element.
+	XMLName xml.Name `xml:"i"`
+	Msg     string   `xml:"m"` // <m> text; main uses the part before its first "." as the label
+	Tags    []string `xml:"t"` // one entry per <t> child
+	Word    string   `xml:"w"` // <w> text; the key under which occurrences are counted
+}
+
+type Word struct { // Word accumulates the statistics of one distinct word.
+	word    string                    // spelling exactly as read from the input
+	sortkey string                    // lowercased word, used to order the output
+	count   int                       // number of input items carrying this word
+	tags    map[string]map[string]int // label -> tag -> number of occurrences
+}
+
+type Tag struct { // Tag is a candidate tag that getTag builds and ranks.
+	tag     string // spelling exactly as read from the input
+	sortkey string // lowercased tag, used for merging and ordering
+	count   int    // occurrences under the label where this tag is the most frequent one
+}
+
+var (
+	x     = e.ExitErr              // helper from pebbe/errors; presumably exits on a non-nil error (verify)
+	words = make(map[string]*Word) // every word seen so far, keyed by its exact spelling
+
+	ignore = map[string]bool{ // tags that main skips when counting tags for a word
+		"Algemeen":  true,
+		"Artikelen": true,
+		"Nieuws":    true,
+		"Recensies": true,
+	}
+)
+
+// main reads one XML item per line from stdin, counts how often each word
+// occurs, and prints every word seen more than once with its dominant tags.
+func main() {
+	scanner := bufio.NewScanner(os.Stdin)
+	for scanner.Scan() {
+		var item Item
+		x(xml.Unmarshal([]byte(scanner.Text()), &item))
+		w, ok := words[item.Word]
+		if !ok {
+			w = &Word{
+				word:    item.Word,
+				sortkey: strings.ToLower(item.Word),
+				tags:    make(map[string]map[string]int),
+			}
+			words[item.Word] = w
+		}
+		w.count++
+		lbl := strings.SplitN(item.Msg, ".", 2)[0] // label = Msg up to its first "."; whole Msg (no panic) when "." is absent
+		for _, tag := range item.Tags {
+			if !ignore[tag] {
+				if _, ok := w.tags[lbl]; !ok {
+					w.tags[lbl] = make(map[string]int)
+				}
+				if tag != item.Word { // a word is never counted as its own tag
+					w.tags[lbl][tag]++
+				}
+			}
+		}
+	}
+	x(scanner.Err())
+
+	wordlist := make([]*Word, 0, len(words))
+	for _, value := range words {
+		if value.count > 1 {
+			wordlist = append(wordlist, value)
+		}
+	}
+
+	// Most frequent first; ties by lowercased word, then exact spelling, so the order never depends on map iteration.
+	sort.Slice(wordlist, func(a, b int) bool {
+		p, q := wordlist[a], wordlist[b]
+		if p.count != q.count {
+			return p.count > q.count
+		}
+		return p.sortkey < q.sortkey || (p.sortkey == q.sortkey && p.word < q.word)
+	})
+	for _, w := range wordlist {
+		fmt.Printf("%6d\t%s\t%s\n", w.count, w.word, getTag(w.tags))
+	}
+}
+
+// getTag returns the dominant tags of a word, joined by ", ".
+//
+// tags maps a label to per-tag counts. For every label only its most frequent
+// tag(s) are kept; candidates equal after lowercasing are merged by summing
+// their counts, keeping the spelling of the highest-ranked one. Merged tags
+// counted more than once are returned, most frequent first.
+func getTag(tags map[string]map[string]int) string {
+	// byCount orders on count (descending), then lowercased tag, then spelling.
+	byCount := func(list []Tag) {
+		sort.Slice(list, func(a, b int) bool {
+			if list[a].count != list[b].count {
+				return list[a].count > list[b].count
+			}
+			if list[a].sortkey != list[b].sortkey {
+				return list[a].sortkey < list[b].sortkey
+			}
+			return list[a].tag < list[b].tag
+		})
+	}
+
+	all := make([]Tag, 0)
+	for _, tagv := range tags {
+		n := 0
+		tt := make([]string, 0)
+		for key, value := range tagv {
+			if value > n {
+				n = value
+				tt = []string{key}
+			} else if value == n {
+				tt = append(tt, key)
+			}
+		}
+		for _, t := range tt {
+			all = append(all, Tag{tag: t, count: n, sortkey: strings.ToLower(t)})
+		}
+	}
+	byCount(all) // fixes which spelling is met first for every sortkey
+
+	// Merge on sortkey via an index: comparing only neighbours missed equal
+	// keys separated by the count-first order, listing a tag twice.
+	merged := make([]Tag, 0, len(all))
+	index := make(map[string]int, len(all))
+	for _, t := range all {
+		if i, ok := index[t.sortkey]; ok {
+			merged[i].count += t.count
+			continue
+		}
+		index[t.sortkey] = len(merged)
+		merged = append(merged, t)
+	}
+	byCount(merged)
+
+	aa := make([]string, 0, len(merged))
+	for _, t := range merged {
+		if t.count > 1 {
+			aa = append(aa, t.tag)
+		}
+	}
+	return strings.Join(aa, ", ")
+}
diff --git a/cmd/top20/top20.go b/cmd/top20/top20.go
index 7b81202..8e6c51e 100644
--- a/cmd/top20/top20.go
+++ b/cmd/top20/top20.go
@@ -34,7 +34,7 @@ func main() {
x(err)
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
- seen[strings.SplitN(scanner.Text(), "\t", 2)[1]] = true
+ seen[strings.Split(scanner.Text(), "\t")[1]] = true
}
x(scanner.Err())
x(fp.Close())
@@ -49,7 +49,7 @@ func main() {
n := 0
for scanner.Scan() && n < 20 {
line := scanner.Text()
- w := strings.SplitN(line, "\t", 2)[1]
+ w := strings.Split(line, "\t")[1]
if seen[w] {
continue
}
diff --git a/collect.sh b/collect.sh
index 223f8f6..4f60929 100755
--- a/collect.sh
+++ b/collect.sh
@@ -23,70 +23,64 @@ fi
cd /net/corpora/nlnieuws/data
-for i in 1 4
+declare -A parts
+parts[alles]='.'
+parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso'
+parts[groningen]='GG|Sikkom'
+parts[AT5]='AT5'
+parts[GG]='GG'
+parts[NOS]='NOS'
+parts[NU]='NU'
+parts[NieuwsNL]='NieuwsNL'
+parts[RO]='RO'
+parts[Sargasso]='Sargasso'
+parts[Sikkom]='Sikkom'
+parts[Tzum]='Tzum'
+parts[VRT]='VRT'
+
+for part in ${!parts[@]}
do
+ regex=${parts[$part]}
- files=$(find .. $(week2files $ds $i))
+ for i in 1 4
+ do
+ files=$(find .. $(week2files $ds $i) | grep -E "$regex")
- alto \
- 'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
- tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-namen-$ds-$i
+ # tellingen met tags
- top20 nieuw-namen-$ds-$i
- # score nieuw-namen-$ds-$i > nieuw-namen-$ds-$i.score
+ alto tq:../xquery/nieuwe_namen.xq $files | sort | uniq | items2count > $part-nieuwe-namen-$ds-$i
+ top20 $part-nieuwe-namen-$ds-$i
- alto \
- 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
- his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
- tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-woorden-$ds-$i
+ alto tq:../xquery/nieuwe_woorden.xq $files | sort | uniq | items2count > $part-nieuwe-woorden-$ds-$i
+ top20 $part-nieuwe-woorden-$ds-$i
- top20 nieuw-woorden-$ds-$i
- # score nieuw-woorden-$ds-$i > nieuw-woorden-$ds-$i.score
+ alto tq:../xquery/locaties.xq $files | sort | uniq | items2count > $part-locaties-$ds-$i
- alto \
- 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
- his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
- 'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-woorden-extra-$ds-$i
+ alto tq:../xquery/personen.xq $files | sort | uniq | items2count > $part-personen-$ds-$i
- top20 nieuw-woorden-extra-$ds-$i
- # score nieuw-woorden-extra-$ds-$i > nieuw-woorden-extra-$ds-$i.score
+ alto tq:../xquery/organisaties.xq $files | sort | uniq | items2count > $part-organisaties-$ds-$i
- alto \
- 'fp://node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"])]' \
- tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > locaties-$ds-$i
+ alto tq:../xquery/overige_namen.xq $files | sort | uniq | items2count > $part-overige-namen-$ds-$i
- # score locaties-$ds-$i > locaties-$ds-$i.score
+ # tellingen met postags
- alto \
- 'fp://node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"])]' \
- tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > personen-$ds-$i
-
- # score personen-$ds-$i > personen-$ds-$i.score
-
- alto \
- 'fp://node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"])]' \
- tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > organisaties-$ds-$i
-
- # score organisaties-$ds-$i > organisaties-$ds-$i.score
-
- alto \
- 'fp://node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"])]' \
- tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > overige-namen-$ds-$i
-
- # score overige-namen-$ds-$i > overige-namen-$ds-$i.score
-
- alto \
- 'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
- 'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-adjww-extra-$ds-$i
-
- # score nieuws-adjww-extra-$ds-$i > nieuw-adjww-extra-$ds-$i.score
+ alto \
+ 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
+ his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
+ 'tt:%w\t%l\t%P\t%I' $files \
+ | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
+ | sed -e 's/\(.*\)\t.*/\1/' | uniq -c \
+ | grep -v '^ *1 ' \
+ | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
+ > $part-nieuwe-woorden-extra-$ds-$i
+ alto \
+ 'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
+ 'tt:%w\t%P\t%I' $files \
+ | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
+ | sed -e 's/\(.*\)\t.*/\1/' | uniq -c \
+ | grep -v '^ *1 ' \
+ | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
+ > $part-nieuwe-adjww-extra-$ds-$i
+ done
done
diff --git a/collect2.sh b/collect2.sh
deleted file mode 100755
index 00c93db..0000000
--- a/collect2.sh
+++ /dev/null
@@ -1,95 +0,0 @@
-#!/bin/bash
-
-set -e
-
-unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
-export TZ=Europe/Amsterdam
-
-if [ "$1" = "" ]
-then
- ds=`ISOWeek -7`
-else
- case "$1" in
- 2[0-9][0-9][0-9]-[0-5][0-9])
- ds=$1
- ;;
- *)
- echo INVALID
- exit 1
- ;;
- esac
-fi
-
-cd /net/corpora/nlnieuws/data
-
-for corpus in AT5 GG NOS NU NieuwsNL RO Sargasso Sikkom Tzum VRT
-do
- for i in 1 4
- do
-
- files=$(find ../$corpus $(week2files $ds $i))
-
- alto \
- 'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
- tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-nieuw-namen-$ds-$i
-
- top20 $corpus-nieuw-namen-$ds-$i
- # score $corpus-nieuw-namen-$ds-$i > $corpus-nieuw-namen-$ds-$i.score
-
- alto \
- 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
- his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
- tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-nieuw-woorden-$ds-$i
-
- top20 $corpus-nieuw-woorden-$ds-$i
- # score $corpus-nieuw-woorden-$ds-$i > $corpus-nieuw-woorden-$ds-$i.score
-
- alto \
- 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
- his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
- 'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-nieuw-woorden-extra-$ds-$i
-
- top20 $corpus-nieuw-woorden-extra-$ds-$i
- # score $corpus-nieuw-woorden-extra-$ds-$i > $corpus-nieuw-woorden-extra-$ds-$i.score
-
- alto \
- 'fp://node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"])]' \
- tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-locaties-$ds-$i
-
- # score $corpus-locaties-$ds-$i > $corpus-locaties-$ds-$i.score
-
- alto \
- 'fp://node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"])]' \
- tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-personen-$ds-$i
-
- # score $corpus-personen-$ds-$i > $corpus-personen-$ds-$i.score
-
- alto \
- 'fp://node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"])]' \
- tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-organisaties-$ds-$i
-
- # score $corpus-organisaties-$ds-$i > $corpus-organisaties-$ds-$i.score
-
- alto \
- 'fp://node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"])]' \
- tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-overige-namen-$ds-$i
-
- # score $corpus-overige-namen-$ds-$i > $corpus-overige-namen-$ds-$i.score
-
- alto \
- 'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
- 'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
- sort -f -k 2 | sort -n -r -k 1,1 -s > $corpus-nieuw-adjww-extra-$ds-$i
-
- # score $corpus-nieuws-adjww-extra-$ds-$i > $corpus-nieuw-adjww-extra-$ds-$i.score
-
- done
-done
diff --git a/go.mod b/go.mod
index e583a0e..66cde4c 100644
--- a/go.mod
+++ b/go.mod
@@ -6,7 +6,12 @@ require github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5
require (
codeberg.org/pebbe/errors v0.4.0
+ github.com/kr/pretty v0.3.1
github.com/pebbe/textcat/v2 v2.3.0
)
-require github.com/pebbe/util v0.9.0 // indirect
+require (
+ github.com/kr/text v0.2.0 // indirect
+ github.com/pebbe/util v0.9.0 // indirect
+ github.com/rogpeppe/go-internal v1.9.0 // indirect
+)
diff --git a/go.sum b/go.sum
index 899fa8d..1d09548 100644
--- a/go.sum
+++ b/go.sum
@@ -1,8 +1,16 @@
codeberg.org/pebbe/errors v0.4.0 h1:G05wsXpC/LRPaL02QYDwtz0sWFWQcIWK1s+MC79LBzU=
codeberg.org/pebbe/errors v0.4.0/go.mod h1:O7PPxUJM1bWRHq11CRK3wqVaH/3NnRaSVZvh3UhzDCY=
+github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 h1:tQbR4RKFBFi0+Ll69dXejKKUbQVNaOAT2fjlDvSAfx4=
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5/go.mod h1:kQE2lxPgVKe0JsBZMFFfMm5zBDCuRhaHFKOBzZeCLiw=
+github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
+github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
+github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
+github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/pebbe/textcat/v2 v2.3.0 h1:RB2egIQgI2a2Ls+I9No6KFQKCZBIFt8Cc/SWCnVtC7Y=
github.com/pebbe/textcat/v2 v2.3.0/go.mod h1:WLXWuL+fOlQJqn6LmubjD+e78hCC6Y/rAWInh0wq/kg=
github.com/pebbe/util v0.9.0 h1:PMZd+CpWb8GbWEmFGlL3qd6XPuywl6xFIbrXWi870OA=
github.com/pebbe/util v0.9.0/go.mod h1:ynWl/SFX4+Seb9fpjVlYevr1f4TP7FrCmyZHiBCg69Q=
+github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
+github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
+github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
diff --git a/xquery/howto b/xquery/howto
deleted file mode 100644
index 6dd4a53..0000000
--- a/xquery/howto
+++ /dev/null
@@ -1,12 +0,0 @@
-alto *.data.dz tq:nieuwe_namen.xq | sort | uniq > items.txt
-
-voor elk item dit bijwerken:
-
- type Item struct {
- count int
- tags map[string]int
- }
-
- items := make(map[string]Item)
-
-
diff --git a/xquery/locaties.xq b/xquery/locaties.xq
new file mode 100644
index 0000000..b060c61
--- /dev/null
+++ b/xquery/locaties.xq
@@ -0,0 +1,16 @@
+for $x in //node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"])]
+ return (
+ {replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "")}
+{
+for $i in data(/alpino_ds/metadata/meta[@name="tag"]/@value)
+ return {$i}
+}
+ { data($x//@lemma) }
+, '
' )
+
+(:
+
+{ data(/alpino_ds/sentence/@sentid) }
+{ replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "") }
+
+:)
diff --git a/xquery/nieuwe_namen.xq b/xquery/nieuwe_namen.xq
index cb48f45..86d2d61 100644
--- a/xquery/nieuwe_namen.xq
+++ b/xquery/nieuwe_namen.xq
@@ -2,10 +2,6 @@ for $x in //node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and n
return (
{replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "")}
{
-for $i in data(/alpino_ds/metadata/meta[@name="cat"]/@value)
- return {$i}
-}
-{
for $i in data(/alpino_ds/metadata/meta[@name="tag"]/@value)
return {$i}
}
diff --git a/xquery/nieuwe_woorden.xq b/xquery/nieuwe_woorden.xq
new file mode 100644
index 0000000..ee08a03
--- /dev/null
+++ b/xquery/nieuwe_woorden.xq
@@ -0,0 +1,16 @@
+for $x in //node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]
+ return (
+ {replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "")}
+{
+for $i in data(/alpino_ds/metadata/meta[@name="tag"]/@value)
+ return {$i}
+}
+ { data($x//@word) }
+, '
' )
+
+(:
+
+{ data(/alpino_ds/sentence/@sentid) }
+{ replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "") }
+
+:)
diff --git a/xquery/organisaties.xq b/xquery/organisaties.xq
new file mode 100644
index 0000000..6e11c52
--- /dev/null
+++ b/xquery/organisaties.xq
@@ -0,0 +1,16 @@
+for $x in //node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"])]
+ return (
+ {replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "")}
+{
+for $i in data(/alpino_ds/metadata/meta[@name="tag"]/@value)
+ return {$i}
+}
+ { data($x//@lemma) }
+, '
' )
+
+(:
+
+{ data(/alpino_ds/sentence/@sentid) }
+{ replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "") }
+
+:)
diff --git a/xquery/overige_namen.xq b/xquery/overige_namen.xq
new file mode 100644
index 0000000..8e60477
--- /dev/null
+++ b/xquery/overige_namen.xq
@@ -0,0 +1,16 @@
+for $x in //node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"])]
+ return (
+ {replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "")}
+{
+for $i in data(/alpino_ds/metadata/meta[@name="tag"]/@value)
+ return {$i}
+}
+ { data($x//@lemma) }
+, '
' )
+
+(:
+
+{ data(/alpino_ds/sentence/@sentid) }
+{ replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "") }
+
+:)
diff --git a/xquery/personen.xq b/xquery/personen.xq
new file mode 100644
index 0000000..9b17caf
--- /dev/null
+++ b/xquery/personen.xq
@@ -0,0 +1,16 @@
+for $x in //node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"])]
+ return (
+ {replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "")}
+{
+for $i in data(/alpino_ds/metadata/meta[@name="tag"]/@value)
+ return {$i}
+}
+ { data($x//@lemma) }
+, '
' )
+
+(:
+
+{ data(/alpino_ds/sentence/@sentid) }
+{ replace(data(/alpino_ds/sentence/@sentid), "\.[^.]*$", "") }
+
+:)