collect.sh
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -25,5 +25,7 @@ Tzum/xml2txt
|
|||||||
VRT/metadata
|
VRT/metadata
|
||||||
VRT/vrt
|
VRT/vrt
|
||||||
bin/ISOWeek
|
bin/ISOWeek
|
||||||
|
bin/week2files
|
||||||
20??
|
20??
|
||||||
corpus
|
corpus
|
||||||
|
data
|
||||||
|
|||||||
3
Makefile
3
Makefile
@@ -11,8 +11,11 @@ all:
|
|||||||
make -C Tzum
|
make -C Tzum
|
||||||
make -C VRT
|
make -C VRT
|
||||||
make bin/ISOWeek
|
make bin/ISOWeek
|
||||||
|
make bin/week2files
|
||||||
|
|
||||||
bin/ISOWeek: cmd/ISOWeek/*.go
|
bin/ISOWeek: cmd/ISOWeek/*.go
|
||||||
go build -o $@ $^
|
go build -o $@ $^
|
||||||
|
|
||||||
|
bin/week2files: cmd/week2files/*.go
|
||||||
|
go build -o $@ $^
|
||||||
|
|
||||||
|
|||||||
95
cmd/week2files/week2files.go
Normal file
95
cmd/week2files/week2files.go
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
e "codeberg.org/pebbe/errors"
|
||||||
|
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
x = e.ExitErr
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
aa := strings.Split(os.Args[1], "-")
|
||||||
|
if len(aa) != 2 {
|
||||||
|
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
|
||||||
|
}
|
||||||
|
|
||||||
|
year, err := strconv.Atoi(aa[0])
|
||||||
|
x(err)
|
||||||
|
week, err := strconv.Atoi(aa[1])
|
||||||
|
x(err)
|
||||||
|
|
||||||
|
if year < 1000 || year > 9999 {
|
||||||
|
x(fmt.Errorf("ongeldig year: %d", year))
|
||||||
|
}
|
||||||
|
if week < 1 || week > 53 {
|
||||||
|
x(fmt.Errorf("ongeldige week: %d", week))
|
||||||
|
}
|
||||||
|
|
||||||
|
// 15 januari van het jaar
|
||||||
|
t := time.Date(year, 1, 15, 12, 0, 0, 0, time.UTC)
|
||||||
|
|
||||||
|
// eerste gok
|
||||||
|
t = t.AddDate(0, 0, 7*week-14)
|
||||||
|
|
||||||
|
// zoek juiste week
|
||||||
|
var y, w int
|
||||||
|
for {
|
||||||
|
y, w = t.ISOWeek()
|
||||||
|
if y < year {
|
||||||
|
t = t.AddDate(0, 12, 0)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if y > year {
|
||||||
|
t = t.AddDate(0, -12, 0)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
y, w = t.ISOWeek()
|
||||||
|
if w < week {
|
||||||
|
t = t.AddDate(0, 0, 7)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if w > week {
|
||||||
|
t = t.AddDate(0, 0, -7)
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if y != year {
|
||||||
|
x(fmt.Errorf("ongeldige combinatie van week/jaar: %d/%d", week, year))
|
||||||
|
}
|
||||||
|
|
||||||
|
// zoek begin van de week
|
||||||
|
d := int(t.Weekday())
|
||||||
|
if d == 0 {
|
||||||
|
d = 7
|
||||||
|
}
|
||||||
|
t = t.AddDate(0, 0, 1-d)
|
||||||
|
|
||||||
|
// drie voorgaande weken
|
||||||
|
t2 := t.AddDate(0, 0, -21)
|
||||||
|
for i := range 4 {
|
||||||
|
if i > 0 {
|
||||||
|
fmt.Print(" -or")
|
||||||
|
}
|
||||||
|
y, w := t2.ISOWeek()
|
||||||
|
fmt.Printf(" -name %d-%02d.data.dz", y, w)
|
||||||
|
t2 = t2.AddDate(0, 0, 7)
|
||||||
|
}
|
||||||
|
|
||||||
|
// vanaf begin drie weken geleden t/m eind huidige week
|
||||||
|
t = t.AddDate(0, 0, -21)
|
||||||
|
for range 28 {
|
||||||
|
fmt.Printf(" -or -name %d-%02d-%02d.data.dz", t.Year(), t.Month(), t.Day())
|
||||||
|
t = t.AddDate(0, 0, 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
60
collect.sh
Executable file
60
collect.sh
Executable file
@@ -0,0 +1,60 @@
|
|||||||
|
#!/bin/bash

# collect.sh [yyyy-ww]
#
# Builds frequency lists from the corpus files selected by week2files for
# the given ISO week. Without an argument the week is taken from
# `ISOWeek -7` (presumably the week of seven days ago -- confirm against
# cmd/ISOWeek). Each list is written to /net/corpora/nlnieuws/data with the
# week as file-name suffix; items that occur only once are dropped.

set -e

unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam

if [ "$1" = "" ]
then
    ds=$(ISOWeek -7)
else
    case "$1" in
        2[0-9][0-9][0-9]-[0-5][0-9])
            ds=$1
            ;;
        *)
            echo INVALID
            exit 1
            ;;
    esac
fi

# Run week2files in its own assignment so that `set -e` aborts the script
# when it fails. Nested inside the find command substitution its exit status
# would be lost and find would list the whole tree.
names=$(week2files "$ds")

# $names is deliberately unquoted: it is a list of find(1) arguments.
files=$(find /net/corpora/nlnieuws/ $names)

cd /net/corpora/nlnieuws/data

# New names
alto \
    'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
    tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-namen-$ds

# New words
alto \
    'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
    tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-woorden-$ds

# New words, with lemma and part-of-speech tag
alto \
    'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
    'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-extra-$ds

# Known locations
alto \
    'fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' \
    tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > locaties-$ds

# Known persons
alto \
    'fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' \
    tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > personen-$ds

# Known organisations
alto \
    'fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his="normal")]' \
    tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > organisatie-$ds

# Known other names
alto \
    'fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' \
    tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > overigen-$ds

# New non-noun words, with part-of-speech tag
alto \
    'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
    'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > overigen-extra-$ds
|
||||||
232
woord-van-de-maand.txt
Normal file
232
woord-van-de-maand.txt
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
Vragen:
|
||||||
|
|
||||||
|
- hoe data range selecteren (bv alles van maart 2026)
|
||||||
|
|
||||||
|
- website met lijstjes top-N (20?)
|
||||||
|
- nieuwe namen
|
||||||
|
- wel of niet onderverdelen naar categorie?
|
||||||
|
- nieuwe woorden
|
||||||
|
- met postag
|
||||||
|
- bestaande namen
|
||||||
|
- personen
|
||||||
|
- plaatsen
|
||||||
|
- organisaties
|
||||||
|
- misc
|
||||||
|
|
||||||
|
- queries worden nog beetje aangepast denk ik
|
||||||
|
|
||||||
|
"nieuw": nu: niet in Alpino, later (ook): niet in top-N van vorige maand.
|
||||||
|
|
||||||
|
|
||||||
|
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' tt:%w |sort | uniq -c |sort -nr | head -n 20
|
||||||
|
|
||||||
|
"nieuwe namen"
|
||||||
|
|
||||||
|
445 Straat van Hormuz
|
||||||
|
433 Jetten
|
||||||
|
309 AI
|
||||||
|
301 Høiby
|
||||||
|
250 Odido
|
||||||
|
190 Zelensky
|
||||||
|
174 Rob Jetten
|
||||||
|
153 VRT NWS
|
||||||
|
134 Jeffrey Epstein
|
||||||
|
130 Anthropic
|
||||||
|
125 Schulting
|
||||||
|
115 GroenLinks-PvdA
|
||||||
|
109 TikTok
|
||||||
|
106 Xandra Velzeboer
|
||||||
|
106 Kyiv
|
||||||
|
106 JA21
|
||||||
|
104 Starmer
|
||||||
|
98 Marius Borg Høiby
|
||||||
|
95 Revolutionaire Garde
|
||||||
|
94 Jens van 't Wout
|
||||||
|
|
||||||
|
|
||||||
|
"nieuwe woorden":
|
||||||
|
|
||||||
|
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' tt:%w |sort | uniq -c |sort -nr |head -n 20
|
||||||
|
|
||||||
|
150 Trump-regering
|
||||||
|
141 coalitieakkoord
|
||||||
|
126 zeestraat
|
||||||
|
122 Golfregio
|
||||||
|
107 massastart
|
||||||
|
96 Amerikaans-Israëlische
|
||||||
|
92 ballistische
|
||||||
|
90 datalek
|
||||||
|
85 kabinet-Jetten
|
||||||
|
82 lng
|
||||||
|
74 droneaanval
|
||||||
|
68 vergeldingsaanvallen
|
||||||
|
61 tussenronde
|
||||||
|
59 Iranoorlog
|
||||||
|
58 vrijgave
|
||||||
|
56 speelzand
|
||||||
|
55 regering-Trump
|
||||||
|
54 sprintrace
|
||||||
|
54 ploegenachtervolging
|
||||||
|
|
||||||
|
liever met postag en lemma erbij:
|
||||||
|
|
||||||
|
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' tt:"%w \t %l \t %P" |sort | uniq -c |sort -nr |head -n 20
|
||||||
|
|
||||||
|
150 Trump-regering Trump_regering N(soort,ev,basis,zijd,stan)
|
||||||
|
141 coalitieakkoord coalitie_akkoord N(soort,ev,basis,onz,stan)
|
||||||
|
126 zeestraat zee_straat N(soort,ev,basis,zijd,stan)
|
||||||
|
121 Golfregio Golf_regio N(soort,ev,basis,zijd,stan)
|
||||||
|
107 massastart massa_start N(soort,ev,basis,zijd,stan)
|
||||||
|
96 Amerikaans-Israëlische Amerikaans_Israëlisch ADJ(prenom,basis,met-e,stan)
|
||||||
|
90 datalek data_lek N(soort,ev,basis,onz,stan)
|
||||||
|
90 ballistische ballistisch ADJ(prenom,basis,met-e,stan)
|
||||||
|
82 lng lng N(soort,ev,basis,onz,stan)
|
||||||
|
74 droneaanval drone_aanval N(soort,ev,basis,zijd,stan)
|
||||||
|
72 kabinet-Jetten kabinet-Jetten N(soort,ev,basis,onz,stan)
|
||||||
|
66 vergeldingsaanvallen vergelding_aanval N(soort,mv,basis)
|
||||||
|
61 tussenronde tussen_ronde N(soort,ev,basis,zijd,stan)
|
||||||
|
59 Iranoorlog Iran_oorlog N(soort,ev,basis,zijd,stan)
|
||||||
|
56 speelzand speel_zand N(soort,ev,basis,onz,stan)
|
||||||
|
55 regering-Trump regering_Trump N(soort,ev,basis,zijd,stan)
|
||||||
|
54 vrijgave vrij_gave N(soort,ev,basis,zijd,stan)
|
||||||
|
54 sprintrace sprint_race N(soort,ev,basis,zijd,stan)
|
||||||
|
54 ploegenachtervolging ploeg_achtervolging N(soort,ev,basis,zijd,stan)
|
||||||
|
53 staatsmedia staat_medium N(soort,mv,basis)
|
||||||
|
|
||||||
|
"bestaande locaties":
|
||||||
|
|
||||||
|
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
|
||||||
|
|
||||||
|
|
||||||
|
3910 Iran
|
||||||
|
2180 Nederland
|
||||||
|
1929 VS
|
||||||
|
1610 Israël
|
||||||
|
1218 Midden-Oosten
|
||||||
|
1128 Oekraïne
|
||||||
|
942 Verenigde Staten
|
||||||
|
874 Rusland
|
||||||
|
823 Amsterdam
|
||||||
|
776 Europa
|
||||||
|
668 DEN HAAG
|
||||||
|
563 België
|
||||||
|
555 China
|
||||||
|
445 Milaan
|
||||||
|
429 Frankrijk
|
||||||
|
389 Duitsland
|
||||||
|
380 Brussel
|
||||||
|
374 Dubai
|
||||||
|
368 Libanon
|
||||||
|
364 Groningen
|
||||||
|
|
||||||
|
"bestaande personen":
|
||||||
|
|
||||||
|
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
|
||||||
|
|
||||||
|
1812 Trump
|
||||||
|
531 Donald Trump
|
||||||
|
327 Khamenei
|
||||||
|
309 Epstein
|
||||||
|
267 Verstappen
|
||||||
|
229 Andrew
|
||||||
|
208 Máxima
|
||||||
|
187 Ali Khamenei
|
||||||
|
161 Orbán
|
||||||
|
146 Trumps
|
||||||
|
133 Mette-Marit
|
||||||
|
133 Keijzer
|
||||||
|
126 Willem-Alexander
|
||||||
|
126 Kok
|
||||||
|
122 Charles
|
||||||
|
118 Stolz
|
||||||
|
113 Harald
|
||||||
|
111 Poetin
|
||||||
|
97 Van Persie
|
||||||
|
94 Wilders
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
"bestaande organisaties":
|
||||||
|
|
||||||
|
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
2575 ANP
|
||||||
|
547 Ajax
|
||||||
|
449 Instagram
|
||||||
|
421 EU
|
||||||
|
357 Defensie
|
||||||
|
349 Feyenoord
|
||||||
|
348 D66
|
||||||
|
346 VVD
|
||||||
|
329 PSV
|
||||||
|
305 Hezbollah
|
||||||
|
303 Tweede Kamer
|
||||||
|
303 NEC
|
||||||
|
296 AZ
|
||||||
|
265 CDA
|
||||||
|
263 OM
|
||||||
|
237 NU.nl
|
||||||
|
232 NOS
|
||||||
|
231 BBC
|
||||||
|
224 Kamer
|
||||||
|
219 Openbaar Ministerie
|
||||||
|
|
||||||
|
|
||||||
|
"bestaande andere namen (boeken, films, events, .. )":
|
||||||
|
|
||||||
|
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
|
||||||
|
|
||||||
|
|
||||||
|
361 Spelen
|
||||||
|
289 Olympische Spelen
|
||||||
|
278 Eredivisie
|
||||||
|
244 X
|
||||||
|
222 Winterspelen
|
||||||
|
177 Champions League
|
||||||
|
147 Formule 1
|
||||||
|
143 Premier League
|
||||||
|
137 X.
|
||||||
|
112 Oscars
|
||||||
|
102 Grand Prix
|
||||||
|
100 Paralympische Spelen
|
||||||
|
90 Facebook
|
||||||
|
78 Eurovisie Songfestival
|
||||||
|
76 WhatsApp
|
||||||
|
75 Parijs-Nice
|
||||||
|
70 Tweede Wereldoorlog
|
||||||
|
67 Oscar
|
||||||
|
66 The New York Times
|
||||||
|
62 AEX-index
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* deze misschien niet? */
|
||||||
|
"nieuwe adjectieven, deelwoorden en werkwoorden":
|
||||||
|
|
||||||
|
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' tt:"%w %P" |sort | uniq -c |sort -nr |head -n 20
|
||||||
|
|
||||||
|
96 Amerikaans-Israëlische ADJ(prenom,basis,met-e,stan)
|
||||||
|
90 ballistische ADJ(prenom,basis,met-e,stan)
|
||||||
|
41 radicaal-rechtse ADJ(prenom,basis,met-e,stan)
|
||||||
|
29 Israëlisch-Amerikaanse ADJ(prenom,basis,met-e,stan)
|
||||||
|
27 pro-Iraanse ADJ(prenom,basis,met-e,stan)
|
||||||
|
25 Belarussische ADJ(prenom,basis,met-e,stan)
|
||||||
|
22 radicaal-linkse ADJ(prenom,basis,met-e,stan)
|
||||||
|
21 Omaanse ADJ(prenom,basis,met-e,stan)
|
||||||
|
19 pro-Palestijnse ADJ(prenom,basis,met-e,stan)
|
||||||
|
16 partijloze ADJ(prenom,basis,met-e,stan)
|
||||||
|
15 Eindhovense ADJ(prenom,basis,met-e,stan)
|
||||||
|
14 cybercriminele ADJ(prenom,basis,met-e,stan)
|
||||||
|
14 bestverkochte WW(vd,prenom,met-e)
|
||||||
|
12 onbevestigde WW(vd,prenom,met-e)
|
||||||
|
12 kindgebonden WW(vd,prenom,zonder)
|
||||||
|
12 AI-gegenereerde WW(vd,prenom,met-e)
|
||||||
|
11 toekomstbestendig ADJ(vrij,basis,zonder)
|
||||||
|
11 omhooggegaan WW(vd,vrij,zonder)
|
||||||
|
11 Iraans-Koerdische ADJ(prenom,basis,met-e,stan)
|
||||||
|
11 antifascistische ADJ(prenom,basis,met-e,stan)
|
||||||
Reference in New Issue
Block a user