collect.sh
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -25,5 +25,7 @@ Tzum/xml2txt
|
||||
VRT/metadata
|
||||
VRT/vrt
|
||||
bin/ISOWeek
|
||||
bin/week2files
|
||||
20??
|
||||
corpus
|
||||
data
|
||||
|
||||
3
Makefile
3
Makefile
@@ -11,8 +11,11 @@ all:
|
||||
make -C Tzum
|
||||
make -C VRT
|
||||
make bin/ISOWeek
|
||||
make bin/week2files
|
||||
|
||||
bin/ISOWeek: cmd/ISOWeek/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
bin/week2files: cmd/week2files/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
|
||||
95
cmd/week2files/week2files.go
Normal file
95
cmd/week2files/week2files.go
Normal file
@@ -0,0 +1,95 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// x is a short alias for e.ExitErr. main hands it every error value it
// produces, nil ones included.
// NOTE(review): presumably it reports a non-nil error and exits, and ignores
// nil — confirm against codeberg.org/pebbe/errors.
var (
|
||||
x = e.ExitErr
|
||||
)
|
||||
|
||||
func main() {
|
||||
aa := strings.Split(os.Args[1], "-")
|
||||
if len(aa) != 2 {
|
||||
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
|
||||
}
|
||||
|
||||
year, err := strconv.Atoi(aa[0])
|
||||
x(err)
|
||||
week, err := strconv.Atoi(aa[1])
|
||||
x(err)
|
||||
|
||||
if year < 1000 || year > 9999 {
|
||||
x(fmt.Errorf("ongeldig year: %d", year))
|
||||
}
|
||||
if week < 1 || week > 53 {
|
||||
x(fmt.Errorf("ongeldige week: %d", week))
|
||||
}
|
||||
|
||||
// 15 januari van het jaar
|
||||
t := time.Date(year, 1, 15, 12, 0, 0, 0, time.UTC)
|
||||
|
||||
// eerste gok
|
||||
t = t.AddDate(0, 0, 7*week-14)
|
||||
|
||||
// zoek juiste week
|
||||
var y, w int
|
||||
for {
|
||||
y, w = t.ISOWeek()
|
||||
if y < year {
|
||||
t = t.AddDate(0, 12, 0)
|
||||
continue
|
||||
}
|
||||
if y > year {
|
||||
t = t.AddDate(0, -12, 0)
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
for {
|
||||
y, w = t.ISOWeek()
|
||||
if w < week {
|
||||
t = t.AddDate(0, 0, 7)
|
||||
continue
|
||||
}
|
||||
if w > week {
|
||||
t = t.AddDate(0, 0, -7)
|
||||
}
|
||||
break
|
||||
}
|
||||
if y != year {
|
||||
x(fmt.Errorf("ongeldige combinatie van week/jaar: %d/%d", week, year))
|
||||
}
|
||||
|
||||
// zoek begin van de week
|
||||
d := int(t.Weekday())
|
||||
if d == 0 {
|
||||
d = 7
|
||||
}
|
||||
t = t.AddDate(0, 0, 1-d)
|
||||
|
||||
// drie voorgaande weken
|
||||
t2 := t.AddDate(0, 0, -21)
|
||||
for i := range 4 {
|
||||
if i > 0 {
|
||||
fmt.Print(" -or")
|
||||
}
|
||||
y, w := t2.ISOWeek()
|
||||
fmt.Printf(" -name %d-%02d.data.dz", y, w)
|
||||
t2 = t2.AddDate(0, 0, 7)
|
||||
}
|
||||
|
||||
// vanaf begin drie weken geleden t/m eind huidige week
|
||||
t = t.AddDate(0, 0, -21)
|
||||
for range 28 {
|
||||
fmt.Printf(" -or -name %d-%02d-%02d.data.dz", t.Year(), t.Month(), t.Day())
|
||||
t = t.AddDate(0, 0, 1)
|
||||
}
|
||||
|
||||
}
|
||||
60
collect.sh
Executable file
60
collect.sh
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/bin/bash
#
# collect.sh [yyyy-ww]
#
# Builds frequency lists of names and words from the corpus files of one ISO
# week plus the three weeks before it (the file selection comes from
# week2files). One list per category is written to
# /net/corpora/nlnieuws/data/<category>-<yyyy-ww>. Items that occur only once
# are left out; the rest is sorted by descending count.
#
# Without an argument the week is whatever `ISOWeek -7` prints.

set -e

unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam

if [ "$1" = "" ]
then
    ds=$(ISOWeek -7)
else
    case "$1" in
        2[0-9][0-9][0-9]-[0-5][0-9])
            ds=$1
            ;;
        *)
            echo INVALID
            exit 1
            ;;
    esac
fi

# week2files gets its own assignment so that `set -e` stops the script when it
# rejects $ds. Nested inside the find substitution its failure went unnoticed,
# and find, left without any -name test, listed every file in the tree.
names=$(week2files "$ds")

# $names is a list of find tests and must be split into words: no quotes.
files=$(find /net/corpora/nlnieuws/ $names)

cd /net/corpora/nlnieuws/data

# tally QUERY FORMAT OUTFILE
# Runs alto with QUERY and FORMAT over $files, counts identical output lines,
# drops those with a count of 1 and writes the rest to OUTFILE, highest count
# first.
tally() {
    alto "$1" "$2" $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > "$3"
}

# known_names NECLASS OUTFILE
# Tallies the lemmas of names with the given neclass that have his="normal".
# All classes share this one template so the queries cannot drift apart.
known_names() {
    tally \
        'fp://node[(@neclass="'"$1"'" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="'"$1"'"] and @his="normal")]' \
        tt:%l "$2"
}

# Query shared by the two "new words" lists below.
new_words='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]'

# New names.
tally \
    'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
    tt:%w "nieuw-namen-$ds"

# New words, as word forms only.
tally "$new_words" tt:%w "nieuw-woorden-$ds"

# New words, with lemma and POS tag.
tally "$new_words" 'tt:%w\t%l\t%P' "nieuw-extra-$ds"

# Known names per category.
known_names LOC  "locaties-$ds"
known_names PER  "personen-$ds"
known_names ORG  "organisatie-$ds"
known_names MISC "overigen-$ds"

# New words that are not nouns, with POS tag.
tally \
    'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
    'tt:%w\t%P' "overigen-extra-$ds"
|
||||
232
woord-van-de-maand.txt
Normal file
232
woord-van-de-maand.txt
Normal file
@@ -0,0 +1,232 @@
|
||||
Vragen:
|
||||
|
||||
- hoe datumbereik selecteren (bv. alles van maart 2026)
|
||||
|
||||
- website met lijstjes top-N (20?)
|
||||
- nieuwe namen
|
||||
- wel of niet onderverdelen naar categorie?
|
||||
- nieuwe woorden
|
||||
- met postag
|
||||
- bestaande namen
|
||||
- personen
|
||||
- plaatsen
|
||||
- organisaties
|
||||
- misc
|
||||
|
||||
- queries worden nog beetje aangepast denk ik
|
||||
|
||||
"nieuw": nu: niet in Alpino, later (ook): niet in top-N van vorige maand.
|
||||
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' tt:%w |sort | uniq -c |sort -nr | head -n 20
|
||||
|
||||
"nieuwe namen"
|
||||
|
||||
445 Straat van Hormuz
|
||||
433 Jetten
|
||||
309 AI
|
||||
301 Høiby
|
||||
250 Odido
|
||||
190 Zelensky
|
||||
174 Rob Jetten
|
||||
153 VRT NWS
|
||||
134 Jeffrey Epstein
|
||||
130 Anthropic
|
||||
125 Schulting
|
||||
115 GroenLinks-PvdA
|
||||
109 TikTok
|
||||
106 Xandra Velzeboer
|
||||
106 Kyiv
|
||||
106 JA21
|
||||
104 Starmer
|
||||
98 Marius Borg Høiby
|
||||
95 Revolutionaire Garde
|
||||
94 Jens van 't Wout
|
||||
|
||||
|
||||
"nieuwe woorden":
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' tt:%w |sort | uniq -c |sort -nr |head -n 20
|
||||
|
||||
150 Trump-regering
|
||||
141 coalitieakkoord
|
||||
126 zeestraat
|
||||
122 Golfregio
|
||||
107 massastart
|
||||
96 Amerikaans-Israëlische
|
||||
92 ballistische
|
||||
90 datalek
|
||||
85 kabinet-Jetten
|
||||
82 lng
|
||||
74 droneaanval
|
||||
68 vergeldingsaanvallen
|
||||
61 tussenronde
|
||||
59 Iranoorlog
|
||||
58 vrijgave
|
||||
56 speelzand
|
||||
55 regering-Trump
|
||||
54 sprintrace
|
||||
54 ploegenachtervolging
|
||||
|
||||
liever met postag en lemma erbij:
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' tt:"%w \t %l \t %P" |sort | uniq -c |sort -nr |head -n 20
|
||||
|
||||
150 Trump-regering Trump_regering N(soort,ev,basis,zijd,stan)
|
||||
141 coalitieakkoord coalitie_akkoord N(soort,ev,basis,onz,stan)
|
||||
126 zeestraat zee_straat N(soort,ev,basis,zijd,stan)
|
||||
121 Golfregio Golf_regio N(soort,ev,basis,zijd,stan)
|
||||
107 massastart massa_start N(soort,ev,basis,zijd,stan)
|
||||
96 Amerikaans-Israëlische Amerikaans_Israëlisch ADJ(prenom,basis,met-e,stan)
|
||||
90 datalek data_lek N(soort,ev,basis,onz,stan)
|
||||
90 ballistische ballistisch ADJ(prenom,basis,met-e,stan)
|
||||
82 lng lng N(soort,ev,basis,onz,stan)
|
||||
74 droneaanval drone_aanval N(soort,ev,basis,zijd,stan)
|
||||
72 kabinet-Jetten kabinet-Jetten N(soort,ev,basis,onz,stan)
|
||||
66 vergeldingsaanvallen vergelding_aanval N(soort,mv,basis)
|
||||
61 tussenronde tussen_ronde N(soort,ev,basis,zijd,stan)
|
||||
59 Iranoorlog Iran_oorlog N(soort,ev,basis,zijd,stan)
|
||||
56 speelzand speel_zand N(soort,ev,basis,onz,stan)
|
||||
55 regering-Trump regering_Trump N(soort,ev,basis,zijd,stan)
|
||||
54 vrijgave vrij_gave N(soort,ev,basis,zijd,stan)
|
||||
54 sprintrace sprint_race N(soort,ev,basis,zijd,stan)
|
||||
54 ploegenachtervolging ploeg_achtervolging N(soort,ev,basis,zijd,stan)
|
||||
53 staatsmedia staat_medium N(soort,mv,basis)
|
||||
|
||||
"bestaande locaties":
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
|
||||
|
||||
|
||||
3910 Iran
|
||||
2180 Nederland
|
||||
1929 VS
|
||||
1610 Israël
|
||||
1218 Midden-Oosten
|
||||
1128 Oekraïne
|
||||
942 Verenigde Staten
|
||||
874 Rusland
|
||||
823 Amsterdam
|
||||
776 Europa
|
||||
668 DEN HAAG
|
||||
563 België
|
||||
555 China
|
||||
445 Milaan
|
||||
429 Frankrijk
|
||||
389 Duitsland
|
||||
380 Brussel
|
||||
374 Dubai
|
||||
368 Libanon
|
||||
364 Groningen
|
||||
|
||||
"bestaande personen":
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
|
||||
|
||||
1812 Trump
|
||||
531 Donald Trump
|
||||
327 Khamenei
|
||||
309 Epstein
|
||||
267 Verstappen
|
||||
229 Andrew
|
||||
208 Máxima
|
||||
187 Ali Khamenei
|
||||
161 Orbán
|
||||
146 Trumps
|
||||
133 Mette-Marit
|
||||
133 Keijzer
|
||||
126 Willem-Alexander
|
||||
126 Kok
|
||||
122 Charles
|
||||
118 Stolz
|
||||
113 Harald
|
||||
111 Poetin
|
||||
97 Van Persie
|
||||
94 Wilders
|
||||
|
||||
|
||||
|
||||
"bestaande organisaties":
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
|
||||
|
||||
|
||||
|
||||
2575 ANP
|
||||
547 Ajax
|
||||
449 Instagram
|
||||
421 EU
|
||||
357 Defensie
|
||||
349 Feyenoord
|
||||
348 D66
|
||||
346 VVD
|
||||
329 PSV
|
||||
305 Hezbollah
|
||||
303 Tweede Kamer
|
||||
303 NEC
|
||||
296 AZ
|
||||
265 CDA
|
||||
263 OM
|
||||
237 NU.nl
|
||||
232 NOS
|
||||
231 BBC
|
||||
224 Kamer
|
||||
219 Openbaar Ministerie
|
||||
|
||||
|
||||
"bestaande andere namen (boeken, films, events, .. )":
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
|
||||
|
||||
|
||||
361 Spelen
|
||||
289 Olympische Spelen
|
||||
278 Eredivisie
|
||||
244 X
|
||||
222 Winterspelen
|
||||
177 Champions League
|
||||
147 Formule 1
|
||||
143 Premier League
|
||||
137 X.
|
||||
112 Oscars
|
||||
102 Grand Prix
|
||||
100 Paralympische Spelen
|
||||
90 Facebook
|
||||
78 Eurovisie Songfestival
|
||||
76 WhatsApp
|
||||
75 Parijs-Nice
|
||||
70 Tweede Wereldoorlog
|
||||
67 Oscar
|
||||
66 The New York Times
|
||||
62 AEX-index
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/* deze misschien niet? */
|
||||
"nieuwe adjectieven, deelwoorden en werkwoorden":
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' tt:"%w %P" |sort | uniq -c |sort -nr |head -n 20
|
||||
|
||||
96 Amerikaans-Israëlische ADJ(prenom,basis,met-e,stan)
|
||||
90 ballistische ADJ(prenom,basis,met-e,stan)
|
||||
41 radicaal-rechtse ADJ(prenom,basis,met-e,stan)
|
||||
29 Israëlisch-Amerikaanse ADJ(prenom,basis,met-e,stan)
|
||||
27 pro-Iraanse ADJ(prenom,basis,met-e,stan)
|
||||
25 Belarussische ADJ(prenom,basis,met-e,stan)
|
||||
22 radicaal-linkse ADJ(prenom,basis,met-e,stan)
|
||||
21 Omaanse ADJ(prenom,basis,met-e,stan)
|
||||
19 pro-Palestijnse ADJ(prenom,basis,met-e,stan)
|
||||
16 partijloze ADJ(prenom,basis,met-e,stan)
|
||||
15 Eindhovense ADJ(prenom,basis,met-e,stan)
|
||||
14 cybercriminele ADJ(prenom,basis,met-e,stan)
|
||||
14 bestverkochte WW(vd,prenom,met-e)
|
||||
12 onbevestigde WW(vd,prenom,met-e)
|
||||
12 kindgebonden WW(vd,prenom,zonder)
|
||||
12 AI-gegenereerde WW(vd,prenom,met-e)
|
||||
11 toekomstbestendig ADJ(vrij,basis,zonder)
|
||||
11 omhooggegaan WW(vd,vrij,zonder)
|
||||
11 Iraans-Koerdische ADJ(prenom,basis,met-e,stan)
|
||||
11 antifascistische ADJ(prenom,basis,met-e,stan)
|
||||
Reference in New Issue
Block a user