Compare commits

...

24 Commits

Author SHA1 Message Date
Peter Kleiweg
c2389c65af stijgers2json.py 2026-06-19 17:53:03 +02:00
Peter Kleiweg
802a84a929 collect.sh: stijgers 2026-06-19 13:41:28 +02:00
Peter Kleiweg
8289072a52 weekadd 2026-06-19 12:59:45 +02:00
Peter Kleiweg
e0550197e3 meer stijgers 2026-06-18 14:49:00 +02:00
Peter Kleiweg
01e6d48665 stijgers 2026-06-18 12:52:40 +02:00
Peter Kleiweg
a8bea0ab44 updates 2026-06-10 13:31:55 +02:00
Peter Kleiweg
d7adc17d4b Volkskrant: columns hebben geen intro 2026-06-09 18:37:19 +02:00
Peter Kleiweg
a9f9e17acf gone gone 2026-06-06 21:05:00 +02:00
Peter Kleiweg
1f4a084624 gone, trends 2026-06-06 17:10:38 +02:00
Peter Kleiweg
9f29222909 ranglijsten 2026-06-06 14:46:01 +02:00
Peter Kleiweg
a76fa21584 update 2026-06-05 16:05:46 +02:00
Peter Kleiweg
efa301cc4a Tzum: <em>Titel</em> -> "Titel" 2026-06-03 17:32:01 +02:00
Peter Kleiweg
14590570ba fix voor zinnen aan elkaar 2026-05-29 17:22:10 +02:00
Peter Kleiweg
ca4e7af8fa tags; .De -> . De 2026-05-29 12:22:57 +02:00
Peter Kleiweg
66581d4e98 data2json.go: aangepast aan grote reorganisatie 2026-05-28 02:59:46 +02:00
Peter Kleiweg
e53049e62f top20.go: aangepast aan grote reorganisatie 2026-05-28 02:36:55 +02:00
Peter Kleiweg
7f23212fc3 app: checkmark bij datumkiezer 2026-05-27 23:19:30 +02:00
Peter Kleiweg
5c651387af grote reorganisatie:
- HLN, NOS, NU, VRT: per week -> per dag
- yyyy-ww -> yyyy.ww
- yyyy*  -> yyyy/yyyy*
etc
2026-05-27 22:42:03 +02:00
Peter Kleiweg
e430ff576b update readme 2026-05-25 12:37:55 +02:00
Peter Kleiweg
9d82f11536 README 2026-05-24 19:13:09 +02:00
Peter Kleiweg
650a13eb4a weg: cmd/score (2) 2026-05-24 16:44:26 +02:00
Peter Kleiweg
bf0407b933 weg: cmd/score 2026-05-24 16:42:12 +02:00
Peter Kleiweg
fcad105a75 helperfuncties naar internal/util (2) 2026-05-24 16:29:56 +02:00
Peter Kleiweg
75832c3132 fix title voor HLN en Parool; helperfuncties naar internal/util 2026-05-24 16:16:21 +02:00
109 changed files with 26801 additions and 1409 deletions

6
.gitignore vendored
View File

@@ -38,15 +38,19 @@ Sikkom/sikkom
Tzum/metadata
Tzum/tzum
Tzum/xml2txt
Volkskrant/metadata
Volkskrant/volkskrant
VRT/metadata
VRT/vrt
bin/data2json
bin/dates2json
bin/flush
bin/items2count
bin/score
bin/rang
bin/top20
bin/trends
bin/week2files
bin/weekadd
20??
corpus
data

View File

@@ -3,11 +3,11 @@ all: \
metadata \
at5
xml2txt: cmd/xml2txt/*.go
go build -o $@ $^
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
go build -o $@ $<
metadata: cmd/metadata/*.go
go build -o $@ $^
at5: cmd/at5/*.go
go build -o $@ $^
at5: cmd/at5/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,13 +3,14 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -46,7 +47,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/AT5/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -76,7 +77,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -110,16 +111,3 @@ func main() {
}()
}
}
// mkLock claims the lock at filename by creating a symbolic link there
// whose target is "<basename of filename>.<pid of this process>". The
// link is then read back to confirm that it carries this process's
// target. Every error is handed to p.
func mkLock(filename string) {
	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(want, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == want {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
}

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"os"
@@ -20,7 +22,7 @@ type Item struct {
var (
x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
)
func main() {
@@ -29,16 +31,16 @@ func main() {
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
ds = fmt.Sprintf("%d.%02d", year, week)
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy.ww"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/AT5/" + dp))
x(os.MkdirAll("out", 0777))
@@ -55,39 +57,15 @@ func main() {
x(err)
var item Item
x(xml.Unmarshal(b, &item), filename)
x(fp.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body/p | //body/h2`)
x(err)
for _, p := range pp {
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
}
x(fp.Close())
}
}
// addEnd trims surrounding whitespace from s and returns it as a single
// line terminated by "\n". Text that is empty after trimming yields "".
// If the text does not already end in sentence punctuation (. ! ?),
// optionally followed by one closing quote (", ' or ”), a full stop is
// appended before the newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Look past a single closing quote, so that `Hij zei: "Ja."` does not
	// become `Hij zei: "Ja.".`. The curly quote ” is multi-byte, hence
	// suffix trimming instead of byte indexing.
	core := s
	for _, quote := range []string{`"`, `'`, `”`} {
		if strings.HasSuffix(core, quote) {
			core = strings.TrimSuffix(core, quote)
			break
		}
	}

	if strings.HasSuffix(core, ".") || strings.HasSuffix(core, "!") || strings.HasSuffix(core, "?") {
		return s + "\n"
	}
	return s + ".\n"
}
// fixSpace normalises whitespace in s: leading and trailing whitespace is
// dropped and every internal run of whitespace becomes one space.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/AT5
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/AT5/corpus/$ds
cd /net/corpora/nlnieuws/AT5/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out
mkdir out
../../xml2txt $ds
xml2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,7 +58,7 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
buurtadam: cmd/buurtadam/*.go
go build -o $@ $^
buurtadam: cmd/buurtadam/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -4,13 +4,14 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -64,7 +65,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/BuurtAdam/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -94,7 +95,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -158,6 +159,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)
@@ -202,7 +205,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
}
for _, div := range divs {
p(fp.WriteString(addEnd(fixSpace(div.Content()))))
p(fp.WriteString(u.AddEnd(u.FixSpace(div.Content()))))
}
p(fp.Close())
@@ -211,40 +214,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
return true
}
// addEnd trims surrounding whitespace from s and returns it as a single
// line terminated by "\n". Text that is empty after trimming yields "".
// If the text does not already end in sentence punctuation (. ! ?),
// optionally followed by one closing quote (", ' or ”), a full stop is
// appended before the newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Look past a single closing quote, so that `Hij zei: "Ja."` does not
	// become `Hij zei: "Ja.".`. The curly quote ” is multi-byte, hence
	// suffix trimming instead of byte indexing.
	core := s
	for _, quote := range []string{`"`, `'`, `”`} {
		if strings.HasSuffix(core, quote) {
			core = strings.TrimSuffix(core, quote)
			break
		}
	}

	if strings.HasSuffix(core, ".") || strings.HasSuffix(core, "!") || strings.HasSuffix(core, "?") {
		return s + "\n"
	}
	return s + ".\n"
}
// fixSpace splits s on whitespace and glues the pieces back together with
// single spaces, so the result has no leading, trailing or repeated
// whitespace.
func fixSpace(s string) string {
	parts := strings.Fields(s)
	return strings.Join(parts, " ")
}
// mkLock claims the lock at filename by creating a symbolic link there
// whose target is "<basename of filename>.<pid of this process>". The
// link is then read back to confirm that it carries this process's
// target. Every error is handed to p.
func mkLock(filename string) {
	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(want, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == want {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/BuurtAdam
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/BuurtAdam/corpus/$ds
cd /net/corpora/nlnieuws/BuurtAdam/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +56,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
buurtgrn: cmd/buurtgrn/*.go
go build -o $@ $^
buurtgrn: cmd/buurtgrn/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -6,11 +6,11 @@ import (
"encoding/xml"
"fmt"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -64,7 +64,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/BuurtGrn/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -94,7 +94,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -158,6 +158,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)
@@ -202,7 +204,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
}
for _, div := range divs {
p(fp.WriteString(addEnd(fixSpace(div.Content()))))
p(fp.WriteString(u.AddEnd(u.FixSpace(div.Content()))))
}
p(fp.Close())
@@ -211,40 +213,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
return true
}
// addEnd trims surrounding whitespace from s and returns it as a single
// line terminated by "\n". Text that is empty after trimming yields "".
// If the text does not already end in sentence punctuation (. ! ?),
// optionally followed by one closing quote (", ' or ”), a full stop is
// appended before the newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Look past a single closing quote, so that `Hij zei: "Ja."` does not
	// become `Hij zei: "Ja.".`. The curly quote ” is multi-byte, hence
	// suffix trimming instead of byte indexing.
	core := s
	for _, quote := range []string{`"`, `'`, `”`} {
		if strings.HasSuffix(core, quote) {
			core = strings.TrimSuffix(core, quote)
			break
		}
	}

	if strings.HasSuffix(core, ".") || strings.HasSuffix(core, "!") || strings.HasSuffix(core, "?") {
		return s + "\n"
	}
	return s + ".\n"
}
// fixSpace normalises whitespace in s: leading and trailing whitespace is
// dropped and every internal run of whitespace becomes one space.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}
// mkLock claims the lock at filename by creating a symbolic link there
// whose target is "<basename of filename>.<pid of this process>". The
// link is then read back to confirm that it carries this process's
// target. Every error is handed to p.
func mkLock(filename string) {
	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(want, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == want {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/BuurtGrn
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/BuurtGrn/corpus/$ds
cd /net/corpora/nlnieuws/BuurtGrn/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +56,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
gg: cmd/gg/*.go
go build -o $@ $^
gg: cmd/gg/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -4,13 +4,14 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -64,7 +65,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/GG/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -94,7 +95,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -154,6 +155,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)
@@ -191,10 +194,10 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
fp, err := os.Create(filename + ".txt")
p(err)
p(fp.WriteString(addEnd(fixSpace(title))))
p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
for _, el := range ell {
p(fp.WriteString(addEnd(fixSpace(el.Content()))))
p(fp.WriteString(u.AddEnd(u.FixSpace(el.Content()))))
}
p(fp.Close())
@@ -203,40 +206,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
return true
}
// addEnd trims surrounding whitespace from s and returns it as a single
// line terminated by "\n". Text that is empty after trimming yields "".
// If the text does not already end in sentence punctuation (. ! ?),
// optionally followed by one closing quote (", ' or ”), a full stop is
// appended before the newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Look past a single closing quote, so that `Hij zei: "Ja."` does not
	// become `Hij zei: "Ja.".`. The curly quote ” is multi-byte, hence
	// suffix trimming instead of byte indexing.
	core := s
	for _, quote := range []string{`"`, `'`, `”`} {
		if strings.HasSuffix(core, quote) {
			core = strings.TrimSuffix(core, quote)
			break
		}
	}

	if strings.HasSuffix(core, ".") || strings.HasSuffix(core, "!") || strings.HasSuffix(core, "?") {
		return s + "\n"
	}
	return s + ".\n"
}
// fixSpace splits s on whitespace and glues the pieces back together with
// single spaces, so the result has no leading, trailing or repeated
// whitespace.
func fixSpace(s string) string {
	parts := strings.Fields(s)
	return strings.Join(parts, " ")
}
// mkLock claims the lock at filename by creating a symbolic link there
// whose target is "<basename of filename>.<pid of this process>". The
// link is then read back to confirm that it carries this process's
// target. Every error is handed to p.
func mkLock(filename string) {
	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(want, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == want {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/GG
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/GG/corpus/$ds
cd /net/corpora/nlnieuws/GG/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,7 +56,7 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
hln: cmd/hln/*.go
go build -o $@ $^
hln: cmd/hln/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -4,13 +4,16 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
// "encoding/json"
"encoding/xml"
"fmt"
// "html"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -25,9 +28,16 @@ type ItemT struct {
UnixTime int64 `xml:"unixTime"`
Guid string `xml:"guid"`
Link string `xml:"link"`
Title string `xml:"title"`
Data []byte `xml:",innerxml"`
}
/*
type GraphT struct {
Graph []map[string]any `json:"@graph"`
}
*/
var (
p = e.PanicErr
w = e.WarnErr
@@ -62,7 +72,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/HLN/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -91,8 +101,7 @@ func main() {
t, err = time.Parse(time.RFC1123, item.PubDate)
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
if exists(dirname + "/lock") {
continue
}
@@ -127,18 +136,19 @@ func main() {
p(fp.WriteString("</item>\n"))
p(fp.Close())
p(os.Chtimes(filename+".xml", t, t))
ok = doArticle(filename, item.Link, t, needUpdate)
ok = doArticle(filename, item.Link, item.Title, t, needUpdate)
}()
}
}
func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) (ok bool) {
func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) {
if exists(filename + ".skip") {
return true
}
if needUpdate {
_ = os.Remove(filename + ".err")
_ = os.Remove(filename + ".html")
// _ = os.Remove(filename + ".json")
_ = os.Remove(filename + ".txt")
} else {
if exists(filename + ".txt") {
@@ -158,6 +168,62 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
/*
s := string(body)
ok = true
i1 := strings.Index(s, `type="application/ld+json"`)
if i1 < 0 {
ok = false
} else {
i1 += strings.Index(s[i1:], `>`) + 1
i2 := i1 + strings.Index(s[i1:], `</script>`)
if i2 < i1 {
ok = false
} else {
s = html.UnescapeString(s[i1:i2])
}
}
if !ok {
_ = w(fmt.Errorf("script jsonld not found: %s", url))
fp, err := os.Create(filename + ".err")
p(err)
p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
p(fp.Close())
p(os.Chtimes(filename+".err", timestamp, timestamp))
fp, err = os.Create(filename + ".html")
p(err)
p(fp.Write(body))
p(fp.Close())
p(os.Chtimes(filename+".html", timestamp, timestamp))
return false
}
var graph GraphT
p(json.Unmarshal([]byte(s), &graph))
for _, g := range graph.Graph {
t := g["@type"]
switch v := t.(type) {
case string:
if v == "NewsArticle" {
b, err := json.Marshal(g)
p(err)
s = string(b)
}
}
}
fp, err := os.Create(filename + ".json")
p(err)
p(fp.WriteString(s))
p(fp.Close())
p(os.Chtimes(filename+".json", timestamp, timestamp))
*/
doc, err := gokogiri.ParseHtml(body)
p(err)
@@ -196,18 +262,6 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
pars := make([]string, 0)
ell, err = article.Search(`.//*[@data-content-type="TITLE"]`)
p(err)
if len(ell) != 1 {
_ = w(fmt.Errorf("found %d titles: %s", len(ell), url))
}
for _, el := range ell {
s := strings.TrimSpace(el.Content())
if s != "" {
pars = append(pars, s)
}
}
hasIntro := false
ell, err = article.Search(`.//*[@data-content-type="INTRO"]`)
p(err)
@@ -224,8 +278,8 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
specials, err := article.Search(`.//*[@data-content-type="GROUP"]`)
p(err)
for _, special := range specials {
special.Remove()
for i := len(specials) - 1; i >= 0; i-- {
specials[i].Remove()
}
other, err := article.Search(`.//*[@data-content-type="PODCAST"]`)
@@ -285,12 +339,14 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(fmt.Fprintln(fp, "##META text tag ="))
} else {
for _, tag := range tags {
p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
}
}
p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
for _, par := range pars {
p(fp.WriteString(addEnd(fixSpace(par))))
p(fp.WriteString(u.AddEnd(u.FixSpace(par))))
}
p(fp.Close())
@@ -299,43 +355,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
return true
}
// addEnd trims surrounding whitespace from s and returns it as a single
// line terminated by "\n". Text that is empty after trimming yields "".
// If the text does not already end in sentence punctuation (. ! ?),
// optionally followed by one closing quote (", ' or ”), a full stop is
// appended before the newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Look past a single closing quote before testing for punctuation.
	// The curly quote ” is multi-byte, hence suffix trimming.
	core := s
	for _, quote := range []string{`"`, `'`, `”`} {
		if strings.HasSuffix(core, quote) {
			core = strings.TrimSuffix(core, quote)
			break
		}
	}

	if strings.HasSuffix(core, ".") || strings.HasSuffix(core, "!") || strings.HasSuffix(core, "?") {
		return s + "\n"
	}
	return s + ".\n"
}
// fixSpace normalises whitespace in s: leading and trailing whitespace is
// dropped and every internal run of whitespace becomes one space.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}
// mkLock claims the lock at filename by creating a symbolic link there
// whose target is "<basename of filename>.<pid of this process>". The
// link is then read back to confirm that it carries this process's
// target. Every error is handed to p.
func mkLock(filename string) {
	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(want, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == want {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/HLN
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -2days +%Y-%m-%d`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1
;;
*)
@@ -23,10 +26,11 @@ else
fi
dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/HLN/corpus/$ds
cd /net/corpora/nlnieuws/HLN/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +55,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -3,11 +3,11 @@ all: \
metadata \
litnl
xml2txt: cmd/xml2txt/*.go
go build -o $@ $^
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
go build -o $@ $<
metadata: cmd/metadata/*.go
go build -o $@ $^
litnl: cmd/litnl/*.go
go build -o $@ $^
litnl: cmd/litnl/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,13 +3,14 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -46,7 +47,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/LitNL/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -76,7 +77,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -108,16 +109,3 @@ func main() {
}
}
// mkLock claims the lock at filename by creating a symbolic link there
// whose target is "<basename of filename>.<pid of this process>". The
// link is then read back to confirm that it carries this process's
// target. Every error is handed to p.
func mkLock(filename string) {
	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(want, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == want {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
}

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"os"
@@ -22,7 +24,7 @@ var (
w = e.WarnErr
x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
)
func main() {
@@ -31,16 +33,16 @@ func main() {
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
ds = fmt.Sprintf("%d.%02d", year, week)
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy.ww"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/LitNL/" + dp))
x(os.MkdirAll("out", 0777))
@@ -58,10 +60,10 @@ func main() {
var item Item
x(xml.Unmarshal(b, &item))
for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
}
x(fp.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body//p`)
@@ -74,32 +76,8 @@ func main() {
_ = w(fmt.Errorf("empty: %s", filename))
}
for _, p := range pp {
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
}
x(fp.Close())
}
}
// addEnd trims surrounding whitespace from s and returns it as a single
// line terminated by "\n". Text that is empty after trimming yields "".
// If the text does not already end in sentence punctuation (. ! ?),
// optionally followed by one closing quote (", ' or ”), a full stop is
// appended before the newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Look past a single closing quote, so that `Hij zei: "Ja."` does not
	// become `Hij zei: "Ja.".`. The curly quote ” is multi-byte, hence
	// suffix trimming instead of byte indexing.
	core := s
	for _, quote := range []string{`"`, `'`, `”`} {
		if strings.HasSuffix(core, quote) {
			core = strings.TrimSuffix(core, quote)
			break
		}
	}

	if strings.HasSuffix(core, ".") || strings.HasSuffix(core, "!") || strings.HasSuffix(core, "?") {
		return s + "\n"
	}
	return s + ".\n"
}
// fixSpace splits s on whitespace and glues the pieces back together with
// single spaces, so the result has no leading, trailing or repeated
// whitespace.
func fixSpace(s string) string {
	parts := strings.Fields(s)
	return strings.Join(parts, " ")
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/LitNL
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/LitNL/corpus/$ds
cd /net/corpora/nlnieuws/LitNL/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out
mkdir out
../../xml2txt $ds
xml2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -16,14 +16,17 @@ all:
make -C Sargasso
make -C Sikkom
make -C Tzum
make -C Volkskrant
make -C VRT
make bin/data2json
make bin/dates2json
make bin/flush
make bin/items2count
make bin/score
make bin/rang
make bin/top20
make bin/trends
make bin/week2files
make bin/weekadd
bin/data2json: cmd/data2json/*.go
go build -o $@ $^
@@ -37,12 +40,18 @@ bin/flush: cmd/flush/*.go
bin/items2count: cmd/items2count/*.go
go build -o $@ $^
bin/score: cmd/score/*.go
bin/rang: cmd/rang/*.go
go build -o $@ $^
bin/top20: cmd/top20/*.go
go build -o $@ $^
bin/trends: cmd/trends/*.go
go build -o $@ $^
bin/week2files: cmd/week2files/*.go
go build -o $@ $^
bin/weekadd: cmd/weekadd/*.go
go build -o $@ $^

View File

@@ -3,11 +3,11 @@ all: \
metadata \
nos
json2txt: cmd/json2txt/*.go
go build -o $@ $^
json2txt: cmd/json2txt/*.go ../internal/util/*.go
go build -o $@ $<
metadata: cmd/metadata/*.go
go build -o $@ $^
nos: cmd/nos/*.go
go build -o $@ $^
nos: cmd/nos/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,6 +3,8 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/json"
"fmt"
"os"
@@ -26,7 +28,7 @@ type Item struct {
var (
x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]$`)
)
func main() {
@@ -34,17 +36,17 @@ func main() {
var ds string
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
t := time.Now().AddDate(0, 0, -2)
ds = fmt.Sprintf("%d-%02d-%02d", t.Year(), int(t.Month()), t.Day())
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy-mm-dd"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := strings.ReplaceAll(ds, "-", "/")
x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp))
x(os.MkdirAll("out", 0777))
@@ -61,13 +63,15 @@ func main() {
x(err)
item := getItem(b, filename)
for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)))
x(fmt.Fprintf(fp, "##META text cat = %s\n", u.FixSpace(cat)))
}
for _, tag := range item.Tags {
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
}
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
for _, line := range strings.SplitAfter(item.Text, "\n") {
x(fp.WriteString(u.AddEnd(u.FixSpace(line, true))))
}
x(fp.WriteString(addEnd(fixSpace(item.Title))))
x(fp.WriteString(fixSpace(item.Text)))
x(fp.Close())
}
}
@@ -90,27 +94,3 @@ func getItem(b []byte, filename string) Item {
x(json.Unmarshal(b, &item), filename)
return item
}
// addEnd trims surrounding whitespace from s and returns it as a single
// line terminated by "\n". Text that is empty after trimming yields "".
// If the text does not already end in sentence punctuation (. ! ?),
// optionally followed by one closing quote (", ' or ”), a full stop is
// appended before the newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Look past a single closing quote, so that `Hij zei: "Ja."` does not
	// become `Hij zei: "Ja.".`. The curly quote ” is multi-byte, hence
	// suffix trimming instead of byte indexing.
	core := s
	for _, quote := range []string{`"`, `'`, `”`} {
		if strings.HasSuffix(core, quote) {
			core = strings.TrimSuffix(core, quote)
			break
		}
	}

	if strings.HasSuffix(core, ".") || strings.HasSuffix(core, "!") || strings.HasSuffix(core, "?") {
		return s + "\n"
	}
	return s + ".\n"
}
// fixSpace normalises whitespace in s: leading and trailing whitespace is
// dropped and every internal run of whitespace becomes one space.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -1,17 +1,17 @@
package main
import (
"html"
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"html"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -63,7 +63,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/NOS/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -94,8 +94,7 @@ func main() {
}
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
if exists(dirname + "/lock") {
continue
}
@@ -195,16 +194,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(os.Chtimes(filename+".json", timestamp, timestamp))
return true
}
// mkLock claims the lock at filename by creating a symbolic link there
// whose target is "<basename of filename>.<pid of this process>". The
// link is then read back to confirm that it carries this process's
// target. Every error is handed to p.
func mkLock(filename string) {
	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(want, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == want {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/NOS
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -2days +%Y-%m-%d`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1
;;
*)
@@ -23,10 +26,11 @@ else
fi
dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/NOS/corpus/$ds
cd /net/corpora/nlnieuws/NOS/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +42,7 @@ fi
rm -fr out
mkdir out
../../json2txt $ds
json2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,15 +57,15 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
nu: cmd/nu/*.go
go build -o $@ $^
nu: cmd/nu/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,6 +3,8 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"bytes"
"encoding/json"
"encoding/xml"
@@ -12,7 +14,6 @@ import (
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -73,7 +74,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/NU/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -97,8 +98,7 @@ func main() {
t, err = time.Parse(time.RFC1123, item.PubDate)
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
if exists(dirname + "/lock") {
continue
}
@@ -161,6 +161,8 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
s := string(body)
ok := true
i1 := strings.Index(s, `<script type="application/ld+json"`)
@@ -226,22 +228,9 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
// text bevat kopjes zonder punt aan het eind
lines := strings.Split(text, "\n")
for i, line := range lines {
line = fixSpace(line)
n := len(line)
if n > 0 {
if strings.ContainsAny(line[n-1:], ".!?") {
continue
}
}
if n > 1 {
s := line[n-2:]
if s == `."` || s == `!"` || s == `?"` {
continue
}
}
lines[i] = line + "."
lines[i] = u.AddEnd(u.FixSpace(line, true))
}
text = strings.Join(lines, "\n") + "\n"
text = strings.Join(lines, "") + "\n"
fp, err := os.Create(filename + ".txt")
p(err)
@@ -249,7 +238,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(fmt.Fprintln(fp, "##META text tag ="))
} else {
for _, tag := range tags {
p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
}
}
p(fp.WriteString(text))
@@ -259,20 +248,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
return true
}
// fixSpace normalizes whitespace in s: the string is split into its
// whitespace-separated fields, which are then joined with single spaces.
// Leading and trailing whitespace therefore disappears.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}
// mkLock creates a lock at filename in the form of a symbolic link whose
// target is "<base name of filename>.<pid of this process>". The link is
// read back afterwards and compared with the target that was written.
// Every error, including a mismatching target, is handed to p.
func mkLock(filename string) {
	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(want, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == want {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/NU
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -2days +%Y-%m-%d`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1
;;
*)
@@ -23,10 +26,11 @@ else
fi
dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/NU/corpus/$ds
cd /net/corpora/nlnieuws/NU/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +55,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
nieuwsnl: cmd/nieuwsnl/*.go
go build -o $@ $^
nieuwsnl: cmd/nieuwsnl/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"bytes"
"encoding/xml"
"fmt"
@@ -11,7 +13,6 @@ import (
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -64,7 +65,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/NieuwsNL/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -153,6 +154,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)
@@ -173,11 +176,11 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
} else {
for _, a := range aa {
tag = strings.ReplaceAll(a.Content(), "\n", " ")
p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag)))
p(fmt.Fprintf(&buf, "##META text tag = %s\n", u.FixSpace(tag)))
}
}
p(buf.WriteString(addEnd(fixSpace(title))))
p(buf.WriteString(u.AddEnd(u.FixSpace(title))))
// oud: //div[@id="article-blocks"]//p
pp, err := root.Search(`//div[@id="article-blocks"]//div[contains(@class, "paragraph-content")]`)
@@ -204,7 +207,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
return false // echt fout
}
for _, p1 := range pp {
p(buf.WriteString(addEnd(fixSpace(p1.Content()))))
p(buf.WriteString(u.AddEnd(u.FixSpace(p1.Content()))))
}
fp, err := os.Create(filename + ".txt")
@@ -216,40 +219,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
return true
}
// addEnd turns s into a single newline-terminated line. Surrounding
// whitespace is trimmed first; if nothing is left, "" is returned.
// Text that already ends in '.', '!' or '?', or in one of those followed
// by a straight double or single quote, only gets the newline. Anything
// else gets a period added before the newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Sentence punctuation as the very last byte.
	switch s[len(s)-1] {
	case '.', '!', '?':
		return s + "\n"
	}

	// Sentence punctuation directly followed by a closing quote.
	for _, closing := range []string{`."`, `!"`, `?"`, `.'`, `!'`, `?'`} {
		if strings.HasSuffix(s, closing) {
			return s + "\n"
		}
	}

	return s + ".\n"
}
// fixSpace normalizes whitespace in s: the string is split into its
// whitespace-separated fields, which are then joined with single spaces.
// Leading and trailing whitespace therefore disappears.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}
// mkLock creates a lock at filename in the form of a symbolic link whose
// target is "<base name of filename>.<pid of this process>". The link is
// read back afterwards and compared with the target that was written.
// Every error, including a mismatching target, is handed to p.
func mkLock(filename string) {
	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(want, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == want {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
}

View File

@@ -2,8 +2,11 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/NieuwsNL
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
@@ -25,10 +28,11 @@ else
fi
dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/NieuwsNL/corpus/$ds
cd /net/corpora/nlnieuws/NieuwsNL/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -53,14 +57,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -3,11 +3,11 @@ all: \
metadata \
oog
xml2txt: cmd/xml2txt/*.go
go build -o $@ $^
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
go build -o $@ $<
metadata: cmd/metadata/*.go
go build -o $@ $^
oog: cmd/oog/*.go
go build -o $@ $^
oog: cmd/oog/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,13 +3,14 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -46,7 +47,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/Oog/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -76,7 +77,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -111,16 +112,3 @@ func main() {
}
}
// mkLock creates a lock at filename in the form of a symbolic link whose
// target is "<base name of filename>.<pid of this process>". The link is
// read back afterwards and compared with the target that was written.
// Every error, including a mismatching target, is handed to p.
func mkLock(filename string) {
	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(want, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == want {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
}

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"os"
@@ -21,7 +23,7 @@ type Item struct {
var (
x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
)
func main() {
@@ -30,16 +32,16 @@ func main() {
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
ds = fmt.Sprintf("%d.%02d", year, week)
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy.ww"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/Oog/" + dp))
x(os.MkdirAll("out", 0777))
@@ -57,41 +59,20 @@ func main() {
var item Item
x(xml.Unmarshal(b, &item))
for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
t := u.FixSpace(cat)
if t != "Nieuws" {
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
}
}
x(fp.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body/p`)
x(err)
for _, p := range pp {
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
}
x(fp.Close())
}
}
// addEnd turns s into a single newline-terminated line. Surrounding
// whitespace is trimmed first; if nothing is left, "" is returned.
// Text that already ends in '.', '!' or '?', or in one of those followed
// by a straight double or single quote, only gets the newline. Anything
// else gets a period added before the newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Sentence punctuation as the very last byte.
	switch s[len(s)-1] {
	case '.', '!', '?':
		return s + "\n"
	}

	// Sentence punctuation directly followed by a closing quote.
	for _, closing := range []string{`."`, `!"`, `?"`, `.'`, `!'`, `?'`} {
		if strings.HasSuffix(s, closing) {
			return s + "\n"
		}
	}

	return s + ".\n"
}
// fixSpace normalizes whitespace in s: the string is split into its
// whitespace-separated fields, which are then joined with single spaces.
// Leading and trailing whitespace therefore disappears.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Oog
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Oog/corpus/$ds
cd /net/corpora/nlnieuws/Oog/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out
mkdir out
../../xml2txt $ds
xml2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
parool: cmd/parool/*.go
go build -o $@ $^
parool: cmd/parool/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -4,13 +4,16 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
//"encoding/json"
"encoding/xml"
"fmt"
//"html"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -25,9 +28,16 @@ type ItemT struct {
UnixTime int64 `xml:"unixTime"`
Guid string `xml:"guid"`
Link string `xml:"link"`
Title string `xml:"title"`
Data []byte `xml:",innerxml"`
}
/*
type GraphT struct {
Graph []map[string]any `json:"@graph"`
}
*/
var (
p = e.PanicErr
w = e.WarnErr
@@ -62,7 +72,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/Parool/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -92,7 +102,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -122,18 +132,19 @@ func main() {
p(fp.WriteString("</item>\n"))
p(fp.Close())
p(os.Chtimes(filename+".xml", t, t))
ok = doArticle(filename, item.Link, t, needUpdate)
ok = doArticle(filename, item.Link, item.Title, t, needUpdate)
}()
}
}
func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) (ok bool) {
func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) {
if exists(filename + ".skip") {
return true
}
if needUpdate {
_ = os.Remove(filename + ".err")
_ = os.Remove(filename + ".html")
// _ = os.Remove(filename + ".json")
_ = os.Remove(filename + ".txt")
} else {
if exists(filename + ".txt") {
@@ -153,9 +164,67 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)
/*
s := string(body)
ok = true
i1 := strings.Index(s, `<script type="application/ld+json"`)
if i1 < 0 {
ok = false
} else {
i1 += strings.Index(s[i1:], `>`) + 1
i2 := i1 + strings.Index(s[i1:], `</script>`)
if i2 < i1 {
ok = false
} else {
s = html.UnescapeString(s[i1:i2])
}
}
if !ok {
_ = w(fmt.Errorf("script jsonld not found: %s", url))
fp, err := os.Create(filename + ".err")
p(err)
p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
p(fp.Close())
p(os.Chtimes(filename+".err", timestamp, timestamp))
fp, err = os.Create(filename + ".html")
p(err)
p(fp.Write(body))
p(fp.Close())
p(os.Chtimes(filename+".html", timestamp, timestamp))
return false
}
var graph GraphT
p(json.Unmarshal([]byte(s), &graph))
for _, g := range graph.Graph {
t := g["@type"]
switch v := t.(type) {
case string:
if v == "NewsArticle" {
b, err := json.Marshal(g)
p(err)
s = string(b)
}
}
}
fp, err := os.Create(filename + ".json")
p(err)
p(fp.WriteString(s))
p(fp.Close())
p(os.Chtimes(filename+".json", timestamp, timestamp))
*/
root := doc.Root()
articles, err := root.Search(`//article[@id="article-content"]`)
@@ -211,6 +280,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
}
header := headers[0]
isVideo := false
tags := make([]string, 0)
ell, err := header.Search(`.//*[@data-test-id="article-label"]`)
p(err)
@@ -219,25 +289,16 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
}
for _, el := range ell {
s := strings.TrimSpace(el.Content())
if s != "" {
if s != "" && s != "Nieuws" {
tags = append(tags, s)
}
if strings.ToLower(s) == "video" {
isVideo = true
}
}
pars := make([]string, 0)
ell, err = header.Search(`.//*[@data-test-id="article-title"]`)
p(err)
if len(ell) != 1 {
_ = w(fmt.Errorf("found %d titles: %s", len(ell), url))
}
for _, el := range ell {
s := strings.TrimSpace(el.Content())
if s != "" {
pars = append(pars, s)
}
}
found := false
ell, err = header.Search(`.//*[@data-test-id="header-intro"]`)
p(err)
@@ -252,10 +313,10 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
_ = w(fmt.Errorf("no intro: %s", url))
}
specials, err := article.Search(`.//section//aside | .//section//figure | .//section//b`)
specials, err := article.Search(`.//aside | .//figure | .//figcaption | .//section//b`)
p(err)
for _, special := range specials {
special.Remove()
for i := len(specials) - 1; i >= 0; i-- {
specials[i].Remove()
}
ell, err = article.Search(`.//section//*[@data-article-element-index]`)
@@ -287,7 +348,9 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
}
}
if !found {
_ = w(fmt.Errorf("no text, skipping: %s", url))
if !isVideo {
_ = w(fmt.Errorf("no text, skipping: %s", url))
}
fp, err := os.Create(filename + ".skip")
p(fp.WriteString(url + "\n"))
p(err)
@@ -309,12 +372,14 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(fmt.Fprintln(fp, "##META text tag ="))
} else {
for _, tag := range tags {
p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
}
}
p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
for _, par := range pars {
p(fp.WriteString(addEnd(fixSpace(par))))
p(fp.WriteString(u.AddEnd(u.FixSpace(par))))
}
p(fp.Close())
@@ -323,43 +388,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
return true
}
// addEnd turns s into a single newline-terminated line. Surrounding
// whitespace is trimmed first; if nothing is left, "" is returned.
// Text that already ends in '.', '!' or '?', or in one of those followed
// by a closing quote (straight double, straight single, or the
// typographic ”), only gets the newline. Anything else gets a period
// added before the newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Sentence punctuation as the very last byte.
	switch s[len(s)-1] {
	case '.', '!', '?':
		return s + "\n"
	}

	// Sentence punctuation directly followed by a closing quote.
	closings := []string{
		`."`, `!"`, `?"`,
		`.'`, `!'`, `?'`,
		`.”`, `!”`, `?”`,
	}
	for _, closing := range closings {
		if strings.HasSuffix(s, closing) {
			return s + "\n"
		}
	}

	return s + ".\n"
}
// fixSpace normalizes whitespace in s: the string is split into its
// whitespace-separated fields, which are then joined with single spaces.
// Leading and trailing whitespace therefore disappears.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}
// mkLock creates a lock at filename in the form of a symbolic link whose
// target is "<base name of filename>.<pid of this process>". The link is
// read back afterwards and compared with the target that was written.
// Every error, including a mismatching target, is handed to p.
func mkLock(filename string) {
	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(want, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == want {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Parool
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Parool/corpus/$ds
cd /net/corpora/nlnieuws/Parool/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +56,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

95
README.md Normal file
View File

@@ -0,0 +1,95 @@
# Actuele nieuwsberichten
- voor Alpino: nieuwe namen en nieuwe woorden
- voor *Woord van de maand* ook: toplijsten van personen, plaatsen,
organisaties en andere namen (TODO: url van webapp)
Voor interactief gebruik:
```
query.sh
```
## 1. Verzamelen van berichten
Berichten van HLN, NieuwsNL, NOS, NU en VRT per dag in `[A-Z]*/yyyy/mm/dd/`
Overige berichten per week in `[A-Z]*/yyyy/w` + ISO-weeknummer, bijvoorbeeld `Oog/2026/w25/`
crontab van p209327@colossus
```
# m h dom mon dow command
3 * * * * /net/corpora/nlnieuws/AT5/at5
4 * * * * /net/corpora/nlnieuws/BuurtAdam/buurtadam
5 * * * * /net/corpora/nlnieuws/BuurtGrn/buurtgrn
6 * * * * /net/corpora/nlnieuws/GG/gg
7 * * * * /net/corpora/nlnieuws/HLN/hln
8 * * * * /net/corpora/nlnieuws/LitNL/litnl
9 * * * * /net/corpora/nlnieuws/NieuwsNL/nieuwsnl
10 * * * * /net/corpora/nlnieuws/NOS/nos
11 * * * * /net/corpora/nlnieuws/NU/nu
12 * * * * /net/corpora/nlnieuws/Oog/oog
13 * * * * /net/corpora/nlnieuws/Parool/parool
14 * * * * /net/corpora/nlnieuws/RO/ro
15 * * * * /net/corpora/nlnieuws/RTVNoord/rtvnoord
16 * * * * /net/corpora/nlnieuws/Sargasso/sargasso
17 * * * * /net/corpora/nlnieuws/Sikkom/sikkom
18 * * * * /net/corpora/nlnieuws/Tzum/tzum
19 * * * * /net/corpora/nlnieuws/VRT/vrt
20 * * * * /net/corpora/nlnieuws/Volkskrant/volkskrant
```
## 2. Teksten verwerken: omzetten naar zinnen, parsen, metadata toevoegen
Uitvoer in `[A-Z]*/corpus/yyyy/`
crontab van p209327@colossus
```
# m h dom mon dow command
# veel data: elke dag
0 1 * * * /net/corpora/nlnieuws/HLN/txt2corpus.sh
0 1 * * * /net/corpora/nlnieuws/NOS/txt2corpus.sh
0 1 * * * /net/corpora/nlnieuws/NU/txt2corpus.sh
0 1 * * * /net/corpora/nlnieuws/NieuwsNL/txt2corpus.sh
0 1 * * * /net/corpora/nlnieuws/VRT/txt2corpus.sh
0 1 * * * /net/corpora/nlnieuws/Volkskrant/txt2corpus.sh
# weinig data: alleen op dinsdag
0 1 * * 2 /net/corpora/nlnieuws/AT5/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/BuurtAdam/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/BuurtGrn/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/GG/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/LitNL/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/Oog/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/Parool/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/RO/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/RTVNoord/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/Sargasso/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/Sikkom/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/Tzum/txt2corpus.sh
```
## 3. Queries uitvoeren, tellingen doen
Tellingen in `data/`
Gegevens voor webapp in `data/json/`
Op woensdag
crontab van p209327@colossus
```
# m h dom mon dow command
0 1 * * 3 /net/corpora/nlnieuws/collect.sh
```
## 4. Data in json op webplatform zetten
crontab van f109308@colossus
```
# m h dom mon dow command
30 0-23/4 * * * rsync -e 'ssh -F /net/aistaff/alfa/.ssh/config' -a --no-g /net/corpora/nlnieuws/data/json/ webalfa:/home/www/f109308/site/wvdm/data
```

View File

@@ -3,11 +3,11 @@ all: \
metadata \
ro
xml2txt: cmd/xml2txt/*.go
go build -o $@ $^
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
go build -o $@ $<
metadata: cmd/metadata/*.go
go build -o $@ $^
ro: cmd/ro/*.go
go build -o $@ $^
ro: cmd/ro/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,13 +3,14 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -28,8 +29,9 @@ type ItemT struct {
}
var (
p = e.PanicErr
agent = "AhrefsBot/7.0"
p = e.PanicErr
//agent = "AhrefsBot/7.0"
agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/149.0.0.0 Safari/537.36"
)
func exists(filename string) bool {
@@ -46,7 +48,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/RO/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -76,7 +78,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -111,16 +113,3 @@ func main() {
}
}
// mkLock creates a lock at filename in the form of a symbolic link whose
// target is "<base name of filename>.<pid of this process>". The link is
// read back afterwards and compared with the target that was written.
// Every error, including a mismatching target, is handed to p.
func mkLock(filename string) {
	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(want, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == want {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
}

View File

@@ -5,6 +5,8 @@ import (
"github.com/jbowtie/gokogiri"
"github.com/pebbe/textcat/v2"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"bytes"
"encoding/xml"
"fmt"
@@ -24,7 +26,7 @@ var (
x = e.ExitErr
w = e.WarnErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
)
func main() {
@@ -36,16 +38,16 @@ func main() {
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
ds = fmt.Sprintf("%d.%02d", year, week)
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy.ww"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/RO/" + dp))
x(os.MkdirAll("out", 0777))
@@ -61,19 +63,19 @@ func main() {
var buf bytes.Buffer
var item Item
x(xml.Unmarshal(b, &item))
x(buf.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(buf.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
divs, err := root.Search(`//div[@class="donatieformlinks"]`)
x(err)
for _, div := range divs {
div.Remove()
for i := len(divs) - 1; i >= 0; i-- {
divs[i].Remove()
}
pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`)
x(err)
for _, p := range pp {
x(buf.WriteString(addEnd(fixSpace(p.Content()))))
x(buf.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
}
text := buf.String()
@@ -90,33 +92,12 @@ func main() {
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
x(err)
for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
t := u.FixSpace(cat)
if t != "Artikelen" && t != "cafeyn" {
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
}
}
x(fp.WriteString(text))
x(fp.Close())
}
}
// addEnd turns s into a single newline-terminated line. Surrounding
// whitespace is trimmed first; if nothing is left, "" is returned.
// Text that already ends in '.', '!' or '?', or in one of those followed
// by a straight double or single quote, only gets the newline. Anything
// else gets a period added before the newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Sentence punctuation as the very last byte.
	switch s[len(s)-1] {
	case '.', '!', '?':
		return s + "\n"
	}

	// Sentence punctuation directly followed by a closing quote.
	for _, closing := range []string{`."`, `!"`, `?"`, `.'`, `!'`, `?'`} {
		if strings.HasSuffix(s, closing) {
			return s + "\n"
		}
	}

	return s + ".\n"
}
// fixSpace normalizes whitespace in s: the string is split into its
// whitespace-separated fields, which are then joined with single spaces.
// Leading and trailing whitespace therefore disappears.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/RO
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/RO/corpus/$ds
cd /net/corpora/nlnieuws/RO/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out
mkdir out
../../xml2txt $ds
xml2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
rtvnoord: cmd/rtvnoord/*.go
go build -o $@ $^
rtvnoord: cmd/rtvnoord/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,6 +3,8 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/json"
"encoding/xml"
"fmt"
@@ -10,7 +12,6 @@ import (
"io"
"net/http"
"os"
"path/filepath"
"strings"
"time"
)
@@ -75,7 +76,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/RTVNoord/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -100,7 +101,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -225,7 +226,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
// text bevat kopjes zonder punt aan het eind
lines := strings.Split(doc.Text, "\n")
for i, line := range lines {
lines[i] = addEnd(fixSpace(line))
lines[i] = u.AddEnd(u.FixSpace(line, true))
}
text := strings.Join(lines, "") + "\n"
@@ -235,16 +236,21 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(fmt.Fprintln(fp, "##META text tag ="))
} else {
for _, tag := range doc.Tags {
p(fmt.Fprintf(fp, "##META text tag = %s\n", strings.ToLower(fixSpace(tag))))
t := strings.ToLower(u.FixSpace(tag))
if strings.HasPrefix(t, "br_") {
continue
}
t = strings.TrimPrefix(t, "tr_")
p(fmt.Fprintf(fp, "##META text tag = %s\n", t))
}
}
if doc.Cat == "" {
p(fmt.Fprintln(fp, "##META text cat ="))
} else {
p(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(doc.Cat)))
p(fmt.Fprintf(fp, "##META text cat = %s\n", u.FixSpace(doc.Cat)))
}
p(fp.WriteString(addEnd(doc.Title)))
p(fp.WriteString(u.AddEnd(doc.Title)))
p(fp.WriteString(text))
p(fp.Close())
@@ -252,40 +258,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
return true
}
// addEnd turns s into a single newline-terminated line. Surrounding
// whitespace is trimmed first; if nothing is left, "" is returned.
// Text that already ends in '.', '!' or '?', or in one of those followed
// by a straight double or single quote, only gets the newline. Anything
// else gets a period added before the newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Sentence punctuation as the very last byte.
	switch s[len(s)-1] {
	case '.', '!', '?':
		return s + "\n"
	}

	// Sentence punctuation directly followed by a closing quote.
	for _, closing := range []string{`."`, `!"`, `?"`, `.'`, `!'`, `?'`} {
		if strings.HasSuffix(s, closing) {
			return s + "\n"
		}
	}

	return s + ".\n"
}
// fixSpace normalizes whitespace in s: the string is split into its
// whitespace-separated fields, which are then joined with single spaces.
// Leading and trailing whitespace therefore disappears.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}
// mkLock creates a lock at filename in the form of a symbolic link whose
// target is "<base name of filename>.<pid of this process>". The link is
// read back afterwards and compared with the target that was written.
// Every error, including a mismatching target, is handed to p.
func mkLock(filename string) {
	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(want, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == want {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/RTVNoord
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/RTVNoord/corpus/$ds
cd /net/corpora/nlnieuws/RTVNoord/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,15 +56,15 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -3,11 +3,11 @@ all: \
metadata \
sargasso
xml2txt: cmd/xml2txt/*.go
go build -o $@ $^
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
go build -o $@ $<
metadata: cmd/metadata/*.go
go build -o $@ $^
sargasso: cmd/sargasso/*.go
go build -o $@ $^
sargasso: cmd/sargasso/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,13 +3,14 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -46,7 +47,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/Sargasso/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -76,7 +77,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -111,16 +112,3 @@ func main() {
}
}
// mkLock claims the lock at filename by creating a symlink there whose target
// is "<base name of filename>.<pid>", then reads the link back and verifies
// that the target is the one just written. Every failure is handed to p.
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"os"
@@ -21,7 +23,7 @@ type Item struct {
var (
x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
)
func main() {
@@ -30,16 +32,16 @@ func main() {
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
ds = fmt.Sprintf("%d.%02d", year, week)
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy.ww"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/Sargasso/" + dp))
x(os.MkdirAll("out", 0777))
@@ -57,41 +59,17 @@ func main() {
var item Item
x(xml.Unmarshal(b, &item))
for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
}
x(fp.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body//p`)
x(err)
for _, p := range pp {
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
}
x(fp.Close())
}
}
// addEnd trims s and returns it as one newline-terminated line. A full stop
// is appended unless the text already ends in '.', '!' or '?', optionally
// followed by one closing straight quote (" or '). Blank input yields "".
func addEnd(s string) string {
	text := strings.TrimSpace(s)
	if text == "" {
		return ""
	}

	// Look past a single trailing quote, but only if something precedes it.
	end := len(text) - 1
	if end > 0 && (text[end] == '"' || text[end] == '\'') {
		end--
	}

	switch text[end] {
	case '.', '!', '?':
		return text + "\n"
	}
	return text + ".\n"
}
// fixSpace normalizes white space: s is split on runs of white space (which
// also drops leading and trailing space) and the pieces are rejoined with
// single spaces.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Sargasso
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Sargasso/corpus/$ds
cd /net/corpora/nlnieuws/Sargasso/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out
mkdir out
../../xml2txt $ds
xml2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
sikkom: cmd/sikkom/*.go
go build -o $@ $^
sikkom: cmd/sikkom/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"html"
@@ -11,7 +13,6 @@ import (
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -64,7 +65,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/Sikkom/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -89,7 +90,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -151,6 +152,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
s := string(body)
ok := true
@@ -220,49 +223,12 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
fp, err = os.Create(filename + ".txt")
p(err)
p(fp.WriteString(addEnd(fixSpace(title))))
p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
for _, p1 := range pp {
p(fp.WriteString(addEnd(fixSpace(p1.Content()))))
p(fp.WriteString(u.AddEnd(u.FixSpace(p1.Content()))))
}
p(fp.Close())
return true
}
// addEnd trims s and returns it as one newline-terminated line. A full stop
// is appended unless the text already ends in '.', '!' or '?', optionally
// followed by one closing straight quote (" or '). Blank input yields "".
func addEnd(s string) string {
	text := strings.TrimSpace(s)
	if text == "" {
		return ""
	}

	// Look past a single trailing quote, but only if something precedes it.
	end := len(text) - 1
	if end > 0 && (text[end] == '"' || text[end] == '\'') {
		end--
	}

	switch text[end] {
	case '.', '!', '?':
		return text + "\n"
	}
	return text + ".\n"
}
// fixSpace normalizes white space: s is split on runs of white space (which
// also drops leading and trailing space) and the pieces are rejoined with
// single spaces.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}
// mkLock claims the lock at filename by creating a symlink there whose target
// is "<base name of filename>.<pid>", then reads the link back and verifies
// that the target is the one just written. Every failure is handed to p.
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Sikkom
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Sikkom/corpus/$ds
cd /net/corpora/nlnieuws/Sikkom/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,7 +56,7 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index

View File

@@ -3,11 +3,11 @@ all: \
metadata \
tzum
xml2txt: cmd/xml2txt/*.go
go build -o $@ $^
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
go build -o $@ $<
metadata: cmd/metadata/*.go
go build -o $@ $^
tzum: cmd/tzum/*.go
go build -o $@ $^
tzum: cmd/tzum/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,13 +3,14 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -46,7 +47,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/Tzum/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -76,7 +77,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -111,16 +112,3 @@ func main() {
}
}
// mkLock claims the lock at filename by creating a symlink there whose target
// is "<base name of filename>.<pid>", then reads the link back and verifies
// that the target is the one just written. Every failure is handed to p.
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"os"
@@ -21,7 +23,7 @@ type Item struct {
var (
x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
)
func main() {
@@ -30,16 +32,16 @@ func main() {
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
ds = fmt.Sprintf("%d.%02d", year, week)
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy.ww"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/Tzum/" + dp))
x(os.MkdirAll("out", 0777))
@@ -57,10 +59,14 @@ func main() {
var item Item
x(xml.Unmarshal(b, &item))
for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
t := u.FixSpace(cat)
if t == "Nieuws" {
continue
}
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
}
x(fp.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body/p`)
@@ -68,33 +74,9 @@ func main() {
for _, p := range pp {
s := p.Content()
if !strings.Contains(s, "verscheen eerst op Tzum.") {
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
}
}
x(fp.Close())
}
}
// addEnd trims s and returns it as one newline-terminated line. A full stop
// is appended unless the text already ends in '.', '!' or '?', optionally
// followed by one closing straight quote (" or '). Blank input yields "".
func addEnd(s string) string {
	text := strings.TrimSpace(s)
	if text == "" {
		return ""
	}

	// Look past a single trailing quote, but only if something precedes it.
	end := len(text) - 1
	if end > 0 && (text[end] == '"' || text[end] == '\'') {
		end--
	}

	switch text[end] {
	case '.', '!', '?':
		return text + "\n"
	}
	return text + ".\n"
}
// fixSpace normalizes white space: s is split on runs of white space (which
// also drops leading and trailing space) and the pieces are rejoined with
// single spaces.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Tzum
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Tzum/corpus/$ds
cd /net/corpora/nlnieuws/Tzum/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out
mkdir out
../../xml2txt $ds
xml2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
vrt: cmd/vrt/*.go
go build -o $@ $^
vrt: cmd/vrt/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"bytes"
"encoding/xml"
"fmt"
@@ -11,7 +13,6 @@ import (
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -78,7 +79,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/VRT/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -109,8 +110,7 @@ func main() {
if t2.After(t) {
t = t2
}
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
if exists(dirname + "/lock") {
continue
}
@@ -179,6 +179,8 @@ func doArticle(filename string, url string, title string, tags []string, cats []
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
/*
s := string(body)
ok := true
@@ -242,18 +244,18 @@ func doArticle(filename string, url string, title string, tags []string, cats []
p(fmt.Fprintln(&buf, "##META text cat ="))
} else {
for _, cat := range cats {
p(fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat)))
p(fmt.Fprintf(&buf, "##META text cat = %s\n", u.FixSpace(cat)))
}
}
if len(tags) == 0 {
p(fmt.Fprintln(&buf, "##META text tag ="))
} else {
for _, tag := range tags {
p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag)))
p(fmt.Fprintf(&buf, "##META text tag = %s\n", u.FixSpace(tag)))
}
}
_, err = buf.WriteString(addEnd(fixSpace(title)))
_, err = buf.WriteString(u.AddEnd(u.FixSpace(title)))
p(err)
fouten := make([]string, 0)
@@ -262,7 +264,7 @@ func doArticle(filename string, url string, title string, tags []string, cats []
pp, err := root.Search(`//div[@data-sentry-component="ArticleHeading"]//*[contains(@class,"prose-article-body-r")]`)
p(err)
for _, p1 := range pp {
p(fmt.Fprint(&buf, addEnd(fixSpace(p1.Content()))))
p(fmt.Fprint(&buf, u.AddEnd(u.FixSpace(p1.Content()))))
found = true
}
if !found {
@@ -277,7 +279,7 @@ func doArticle(filename string, url string, title string, tags []string, cats []
`//div[@data-sentry-component="ArticleTitle"]//h2`)
p(err)
for _, p1 := range pp {
p(fmt.Fprint(&buf, addEnd(fixSpace(p1.Content()))))
p(fmt.Fprint(&buf, u.AddEnd(u.FixSpace(p1.Content()))))
found = true
}
if !found {
@@ -311,40 +313,3 @@ func doArticle(filename string, url string, title string, tags []string, cats []
return true
}
// addEnd trims s and returns it as one newline-terminated line. A full stop
// is appended unless the text already ends in '.', '!' or '?', optionally
// followed by one closing straight quote (" or '). Blank input yields "".
func addEnd(s string) string {
	text := strings.TrimSpace(s)
	if text == "" {
		return ""
	}

	// Look past a single trailing quote, but only if something precedes it.
	end := len(text) - 1
	if end > 0 && (text[end] == '"' || text[end] == '\'') {
		end--
	}

	switch text[end] {
	case '.', '!', '?':
		return text + "\n"
	}
	return text + ".\n"
}
// fixSpace normalizes white space: s is split on runs of white space (which
// also drops leading and trailing space) and the pieces are rejoined with
// single spaces.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}
// mkLock claims the lock at filename by creating a symlink there whose target
// is "<base name of filename>.<pid>", then reads the link back and verifies
// that the target is the one just written. Every failure is handed to p.
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/VRT
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -2days +%Y-%m-%d`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1
;;
*)
@@ -23,10 +26,11 @@ else
fi
dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/VRT/corpus/$ds
cd /net/corpora/nlnieuws/VRT/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,15 +55,15 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

9
Volkskrant/Makefile Normal file
View File

@@ -0,0 +1,9 @@
all: \
metadata \
volkskrant
metadata: cmd/metadata/*.go
go build -o $@ $^
volkskrant: cmd/volkskrant/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -0,0 +1,131 @@
package main
import (
e "codeberg.org/pebbe/errors"
"bufio"
"encoding/xml"
"fmt"
"html"
"os"
"strings"
"time"
)
// Item is the part of a stored <item> XML document that this program decodes.
type Item struct {
XMLName xml.Name `xml:"item"`
// UnixTime comes from the <unixTime> child element; doXml interprets it
// as seconds since the Unix epoch to derive the publication date.
UnixTime int64 `xml:"unixTime"`
}
var (
x = e.ExitErr
escape = html.EscapeString
data = make(map[string][]string)
location *time.Location
)
// main gathers per-article metadata and injects it into the parsed sentence
// files under xml/.
//
// The current directory is scanned first, then its parent: every *.txt file
// is handed to doText and every *.xml file to doXml, which fill data.
// After that each xml/*.xml file is rewritten through a ".tmp" file: a new
// <metadata> element is inserted right before the first '<' that follows the
// opening "<alpino" tag. It holds a fixed source entry plus the entries that
// data has for the file name with its first and its last two dot-separated
// fields removed. A <metadata> element already in the file is dropped by
// stripMeta.
func main() {
	var err error
	location, err = time.LoadLocation("Europe/Amsterdam")
	x(err)

	// "." is handled before "..", exactly one directory level each.
	sources := [...]struct{ dir, prefix string }{
		{".", ""},
		{"..", "../"},
	}
	for _, src := range sources {
		entries, err := os.ReadDir(src.dir)
		x(err)
		for _, entry := range entries {
			name := entry.Name()
			switch {
			case strings.HasSuffix(name, ".txt"):
				doText(src.prefix, name)
			case strings.HasSuffix(name, ".xml"):
				doXml(src.prefix, name)
			}
		}
	}

	parsed, err := os.ReadDir("xml")
	x(err)
	for _, entry := range parsed {
		name := entry.Name()
		if !strings.HasSuffix(name, ".xml") {
			continue
		}

		fields := strings.Split(name, ".")
		key := strings.Join(fields[1:len(fields)-2], ".")

		path := "xml/" + name
		raw, err := os.ReadFile(path)
		x(err)
		doc := string(raw)

		// Offset of the first '<' behind the one that opens "<alpino".
		at := strings.Index(doc, "<alpino") + 1
		at += strings.Index(doc[at:], "<")

		tmp := path + ".tmp"
		out, err := os.Create(tmp)
		x(err)
		x(out.WriteString(doc[:at]))
		x(out.WriteString("<metadata>\n <meta type=\"text\" name=\"source\" value=\"Volkskrant\"/>\n"))
		for _, meta := range data[key] {
			x(out.WriteString(" " + meta + "\n"))
		}
		x(out.WriteString(" </metadata>\n "))
		x(out.WriteString(stripMeta(doc[at:])))
		x(out.Close())
		x(os.Rename(tmp, path))
	}
}
// doText scans dirname+filename line by line and stores every "##META" line
// with more than four white-space separated fields as a <meta> element in
// data. The map key is filename minus its last four bytes (the ".txt" suffix
// the caller matched). Field 1 becomes the type, field 2 the escaped name,
// and fields 4 and up, joined by single spaces, the escaped value; field 3
// is ignored.
func doText(dirname, filename string) {
	key := filename[:len(filename)-4]
	if _, seen := data[key]; !seen {
		data[key] = []string{}
	}

	in, err := os.Open(dirname + filename)
	x(err)
	defer func() { x(in.Close()) }()

	lines := bufio.NewScanner(in)
	for lines.Scan() {
		text := lines.Text()
		if !strings.HasPrefix(text, "##META") {
			continue
		}
		fields := strings.Fields(text)
		if len(fields) <= 4 {
			continue
		}
		value := strings.Join(fields[4:], " ")
		meta := fmt.Sprintf(`<meta type="%s" name="%s" value="%s"/>`,
			fields[1], escape(fields[2]), escape(value))
		data[key] = append(data[key], meta)
	}
	x(lines.Err())
}
// doXml decodes dirname+filename as an Item and stores its publication date
// as a <meta type="date" name="pubdate"> element in data, keyed by filename
// minus its last four bytes. The date is Item.UnixTime, taken as seconds
// since the Unix epoch, expressed in the package-level location.
func doXml(dirname, filename string) {
	key := filename[:len(filename)-4]
	if _, seen := data[key]; !seen {
		data[key] = []string{}
	}

	raw, err := os.ReadFile(dirname + filename)
	x(err)

	var item Item
	x(xml.Unmarshal(raw, &item))

	year, month, day := time.Unix(item.UnixTime, 0).In(location).Date()
	pubdate := fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
		year, int(month), day)
	data[key] = append(data[key], pubdate)
}
// stripMeta removes the first <metadata>...</metadata> element from s,
// together with any spaces, tabs and line breaks directly behind it.
//
// s is returned unchanged when it has no "<metadata>" opening tag, or when
// that tag is not followed by a "</metadata>" closing tag.
func stripMeta(s string) string {
	const openTag, closeTag = "<metadata>", "</metadata>"

	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}

	// A missing closing tag used to feed Index's -1 into the offset
	// arithmetic, which cut out just the opening tag and left the element's
	// content behind. Leave such input alone instead.
	n := strings.Index(s[start:], closeTag)
	if n < 0 {
		return s
	}

	end := start + n + len(closeTag)
	return s[:start] + strings.TrimLeft(s[end:], " \t\r\n")
}

View File

@@ -0,0 +1,390 @@
package main
import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
//"encoding/json"
"encoding/xml"
"fmt"
//"html"
"io"
"net/http"
"net/url"
"os"
"strings"
"time"
)
// Rss is the decoded RSS document; only the items are kept.
type Rss struct {
XMLName xml.Name `xml:"rss"`
// Items holds every <item> element found inside <channel>.
Items []ItemT `xml:"channel>item"`
}
// ItemT is one <item> of the RSS feed.
type ItemT struct {
// PubDate is the raw <pubDate> text; main parses it as RFC 1123.
PubDate string `xml:"pubDate"`
UnixTime int64 `xml:"unixTime"`
// Guid is used by main (path-escaped) as the base name of the stored files.
Guid string `xml:"guid"`
// Link is the URL that doArticle downloads.
Link string `xml:"link"`
Title string `xml:"title"`
// Data is the raw XML nested inside the <item> element; main copies it
// unchanged into the stored .xml file.
Data []byte `xml:",innerxml"`
}
/*
type GraphT struct {
Graph []map[string]any `json:"@graph"`
}
*/
var (
p = e.PanicErr
w = e.WarnErr
agent = "AhrefsBot/7.0"
)
// exists reports whether os.Stat succeeds for filename. Any Stat error,
// whatever its cause, counts as "does not exist".
func exists(filename string) bool {
	if _, err := os.Stat(filename); err != nil {
		return false
	}
	return true
}
// fileDate returns the text between the first "<unixTime>" tag in the file
// and the first "</unixTime>" tag after it. It returns "" when the file
// cannot be read or holds no such pair.
func fileDate(filename string) string {
	b, err := os.ReadFile(filename)
	if err != nil {
		return ""
	}

	const openTag, closeTag = "<unixTime>", "</unixTime>"
	s := string(b)

	start := strings.Index(s, openTag)
	if start < 0 {
		// Without this check Index's -1 became offset 9, and a stray
		// closing tag further on made the function return unrelated text.
		return ""
	}
	start += len(openTag)

	n := strings.Index(s[start:], closeTag)
	if n < 0 {
		return ""
	}
	return s[start : start+n]
}
// main downloads the Volkskrant RSS feed and, for every item, stores the
// item's XML and the text of the linked article in a per-day directory
// /net/corpora/nlnieuws/Volkskrant/yyyy/mm/dd derived from the item's pubDate.
func main() {
// Registered first, so it runs last: by then the deferred lock removal
// below has already run. If e.Panicked is set, the panic is swallowed and
// the process exits with status 1.
// NOTE(review): presumably p (e.PanicErr) sets e.Panicked before
// panicking; confirm in the errors package.
defer func() {
if e.Panicked {
_ = recover()
os.Exit(1)
}
}()
myLock := "/net/corpora/nlnieuws/Volkskrant/lock"
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
// Fetch and decode the feed; an empty feed is treated as an error.
req, err := http.NewRequest("GET", "https://www.volkskrant.nl/rss.xml", nil)
p(err)
req.Header.Set("User-Agent", agent)
client := &http.Client{}
resp, err := client.Do(req)
p(err)
body, err := io.ReadAll(resp.Body)
p(err)
p(resp.Body.Close())
var rss Rss
p(xml.Unmarshal(body, &rss))
if len(rss.Items) == 0 {
p(fmt.Errorf("len(rss.Items) == 0"))
}
for _, item := range rss.Items {
// pubDate is tried with a numeric zone first, then with a zone name.
t, err := time.Parse(time.RFC1123Z, item.PubDate)
if err != nil {
t, err = time.Parse(time.RFC1123, item.PubDate)
}
p(err)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Volkskrant/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
// A day directory that contains a "lock" entry is left alone.
if exists(dirname + "/lock") {
continue
}
basename := item.Guid
filename := dirname + "/" + url.PathEscape(basename)
ts := fmt.Sprintf("%d", t.Unix())
// Must be evaluated before the .xml file is rewritten below: the item
// needs an update when the stored unixTime differs from its pubDate.
needUpdate := fileDate(filename+".xml") != ts
p(os.MkdirAll(dirname, 0777))
// The closure gives each item its own deferred cleanup.
func() {
var ok bool
defer func() {
// When a panic is in flight, report which item was being processed.
if e.Panicked {
fmt.Fprintln(os.Stderr, "----", filename)
fmt.Fprintln(os.Stderr, "----", item.Link)
}
// Remove the .xml file unless doArticle returned true (ok also
// stays false when a panic cut this closure short).
if !ok {
_ = os.Remove(filename + ".xml")
}
}()
// Store the item: the parsed time as <unixTime>, followed by the
// item's original inner XML.
fp, err := os.Create(filename + ".xml")
p(err)
p(fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n"))
p(fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix()))
p(fp.Write(item.Data))
p(fp.WriteString("</item>\n"))
p(fp.Close())
p(os.Chtimes(filename+".xml", t, t))
ok = doArticle(filename, item.Link, item.Title, t, needUpdate)
}()
}
}
// doArticle downloads the article at url and writes its text to
// filename+".txt": the "##META text tag" lines first, then the title, the
// header intro and the body elements, one per line.
//
// Nothing is fetched when filename+".skip" exists, or when needUpdate is
// false and filename+".txt" is already present. With needUpdate set, the
// .err, .html and .txt files of an earlier run are removed first.
//
// When the page lacks the expected structure (no article, no header, no body
// elements) a warning is issued, the reason is written to filename+".err",
// the fetched page to filename+".html", and false is returned. Live blogs
// and articles without any text are marked with filename+".skip" and count
// as done (true). Missing labels or a missing intro only cause a warning.
// Every file written gets timestamp as access and modification time.
// Request, parse and file errors are passed to p.
func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) {
	if exists(filename + ".skip") {
		return true
	}

	if needUpdate {
		// Best effort: these files need not exist.
		_ = os.Remove(filename + ".err")
		_ = os.Remove(filename + ".html")
		// _ = os.Remove(filename + ".json")
		_ = os.Remove(filename + ".txt")
	} else if exists(filename + ".txt") {
		return true
	}

	// Throttle: wait two seconds before every download.
	time.Sleep(2 * time.Second)

	req, err := http.NewRequest("GET", url, nil)
	p(err)
	req.Header.Set("User-Agent", agent)
	client := &http.Client{}
	resp, err := client.Do(req)
	p(err)
	body, err := io.ReadAll(resp.Body)
	p(err)
	p(resp.Body.Close())

	body = u.HtmlFix(body)

	doc, err := gokogiri.ParseHtml(body)
	p(err)

	// save writes content to filename+ext, closes the file and stamps it
	// with timestamp. The error of os.Create is checked before the file is
	// used, and the file is always closed.
	save := func(ext string, content []byte) {
		fp, err := os.Create(filename + ext)
		p(err)
		p(fp.Write(content))
		p(fp.Close())
		p(os.Chtimes(filename+ext, timestamp, timestamp))
	}

	// fail warns about an unusable page, records the reason in the .err
	// file and keeps a copy of the page in the .html file.
	fail := func(reason string) bool {
		_ = w(fmt.Errorf("%s: %s", reason, url))
		save(".err", []byte(reason+": "+url+"\n"))
		save(".html", body)
		return false
	}

	/*
		s := string(body)

		ok = true
		i1 := strings.Index(s, `<script type="application/ld+json"`)
		if i1 < 0 {
		ok = false
		} else {
		i1 += strings.Index(s[i1:], `>`) + 1
		i2 := i1 + strings.Index(s[i1:], `</script>`)
		if i2 < i1 {
		ok = false
		} else {
		s = html.UnescapeString(s[i1:i2])
		}
		}
		if !ok {
		_ = w(fmt.Errorf("script jsonld not found: %s", url))
		fp, err := os.Create(filename + ".err")
		p(err)
		p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
		p(fp.Close())
		p(os.Chtimes(filename+".err", timestamp, timestamp))
		fp, err = os.Create(filename + ".html")
		p(err)
		p(fp.Write(body))
		p(fp.Close())
		p(os.Chtimes(filename+".html", timestamp, timestamp))
		return false
		}
		var graph GraphT
		p(json.Unmarshal([]byte(s), &graph))
		for _, g := range graph.Graph {
		t := g["@type"]
		switch v := t.(type) {
		case string:
		if v == "NewsArticle" {
		b, err := json.Marshal(g)
		p(err)
		s = string(b)
		}
		}
		}
		fp, err := os.Create(filename + ".json")
		p(err)
		p(fp.WriteString(s))
		p(fp.Close())
		p(os.Chtimes(filename+".json", timestamp, timestamp))
	*/

	root := doc.Root()

	articles, err := root.Search(`//article[@id="article-content"]`)
	p(err)
	if len(articles) == 0 {
		return fail("empty")
	}
	article := articles[0]

	live, err := article.Search(`.//*[@data-test-id="live-blog-label"]`)
	p(err)
	if len(live) > 0 {
		save(".skip", []byte("liveblog\n"))
		return true
	}

	headers, err := article.Search(`.//header`)
	p(err)
	if len(headers) == 0 {
		// The .err file used to say "no elements" here, contradicting the
		// warning; both now name the missing header.
		return fail("no header")
	}
	header := headers[0]

	// Labels become tags, except "Nieuws"; opinion pieces and columns are
	// remembered because they are allowed to lack an intro.
	isOpinie, isColumn := false, false
	tags := make([]string, 0)
	labels, err := header.Search(`.//*[@data-test-id="article-label"]`)
	p(err)
	if len(labels) == 0 {
		_ = w(fmt.Errorf("no labels: %s", url))
	}
	for _, el := range labels {
		s := strings.TrimSpace(el.Content())
		if s != "" && s != "Nieuws" {
			tags = append(tags, s)
		}
		switch strings.ToLower(s) {
		case "opinie":
			isOpinie = true
		case "column":
			isColumn = true
		}
	}

	pars := make([]string, 0)

	intros, err := header.Search(`.//*[@data-test-id="header-intro"]`)
	p(err)
	for _, el := range intros {
		if s := strings.TrimSpace(el.Content()); s != "" {
			pars = append(pars, s)
		}
	}
	if len(pars) == 0 && !isOpinie && !isColumn {
		_ = w(fmt.Errorf("no intro: %s", url))
	}

	// Drop asides, figures, captions and bold text in sections before the
	// body text is collected; removal goes from the last match to the first.
	specials, err := article.Search(`.//aside | .//figure | .//figcaption | .//section//b`)
	p(err)
	for i := len(specials) - 1; i >= 0; i-- {
		specials[i].Remove()
	}

	elements, err := article.Search(`.//section//*[@data-article-element-index]`)
	p(err)
	if len(elements) == 0 {
		return fail("no elements")
	}

	before := len(pars)
	for _, el := range elements {
		if s := strings.TrimSpace(el.Content()); s != "" {
			pars = append(pars, s)
		}
	}
	if len(pars) == before {
		// Body elements exist but none has text: mark the article as
		// skipped and keep the page for inspection.
		_ = w(fmt.Errorf("no text, skipping: %s", url))
		save(".skip", []byte(url+"\n"))
		save(".html", body)
		return true
	}

	fp, err := os.Create(filename + ".txt")
	p(err)
	if len(tags) == 0 {
		p(fmt.Fprintln(fp, "##META text tag ="))
	} else {
		for _, tag := range tags {
			p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
		}
	}
	p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
	for _, par := range pars {
		p(fp.WriteString(u.AddEnd(u.FixSpace(par))))
	}
	p(fp.Close())
	p(os.Chtimes(filename+".txt", timestamp, timestamp))

	return true
}

70
Volkskrant/txt2corpus.sh Executable file
View File

@@ -0,0 +1,70 @@
#!/bin/bash
# Turn one day of downloaded Volkskrant articles (*.txt) into a parsed corpus.
# Usage: txt2corpus.sh [yyyy-mm-dd]   (default: two days ago)
# Abort as soon as any command fails.
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Volkskrant
unset CDPATH
# Commands used below without a path (tokenize.sh, metadata, query.sh, ...)
# are looked up in these directories first.
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
# Pick the day to process: the argument if it looks like yyyy-mm-dd,
# otherwise the day before yesterday.
if [ "$1" = "" ]
then
ds=`date -d -2days +%Y-%m-%d`
else
case "$1" in
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1
;;
*)
echo INVALID
exit 1
;;
esac
fi
# dp is the day's directory relative to $PART (yyyy/mm/dd); all output files
# share the prefix $corpus.
dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
cd $PART/$dp
# Take the lock for this day: a symlink named lock that points at lock.<pid>.
# ln fails when the link already exists; the readlink check makes sure the
# link in place is the one created by this process.
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
then
echo Getting lock failed
exit 1
fi
rm -fr out
mkdir out
rm -f $corpus.lines
# For every article: strip leading white space, drop the ##META lines,
# tokenize, and give each resulting line the label "vk.<article>.<n>|".
for i in *.txt
do
b=`basename $i .txt`
perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \
| perl -e '$n = 0; while(<>) { $n++; print("vk.'$b'.$n|$_"); }' \
>> $corpus.lines
done
cd out
mkdir xml
# Parse all lines with Alpino (treebank directory: xml); its stderr goes to
# $corpus.log.
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
# Add the article metadata to the parsed files; run from out/, so the article
# files are in the parent directory.
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# counts are per article, not per sentence
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
# Clean up the work directory and release the lock.
cd ../..
rm -fr out
rm -f lock

View File

@@ -7,6 +7,7 @@ import (
"encoding/json"
"fmt"
"os"
"regexp"
"strconv"
"strings"
"time"
@@ -49,13 +50,14 @@ var (
parts = map[string]struct {
file string
suffix string
re *regexp.Regexp
}{
"nieuwe namen": {"nieuwe-namen", ".t20"},
"nieuwe woorden": {"nieuwe-woorden-extra", ".t20"},
"personen": {"personen", ""},
"andere namen": {"overige-namen", ""},
"locaties": {"locaties", ""},
"organisaties": {"organisaties", ""},
"nieuwe namen": {"nieuwe-namen", ".t20", nil},
"nieuwe woorden": {"nieuwe-woorden-extra", ".t20", nil},
"personen": {"personen", "", nil},
"andere namen": {"overige-namen", "", nil},
"locaties": {"locaties", "", nil},
"organisaties": {"organisaties", "", regexp.MustCompile(`^(ANP|AT5)`)},
}
maanden = strings.Fields("x januari februari maart april mei juni juli augustus september oktober november december")
@@ -70,9 +72,9 @@ var (
func main() {
aa := strings.Split(os.Args[1], "-")
aa := strings.Split(os.Args[1], ".")
if len(aa) != 2 {
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
}
var err error
@@ -128,7 +130,8 @@ func makeParts(source string) *Parts {
func makeValues(source, part string) [][5]any {
v := make([][5]any, 0)
filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%s-%s-%d-%02d-%d%s",
filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%d/%s-%s-%d.%02d-%d%s",
year,
sources[source],
parts[part].file,
year,
@@ -141,12 +144,15 @@ func makeValues(source, part string) [][5]any {
scanner := bufio.NewScanner(fp)
lineno := 0
for scanner.Scan() {
lineno++
line := scanner.Text()
aa := strings.Split(line, "\t")
count, err := strconv.Atoi(strings.TrimSpace(aa[0]))
x(err)
word := aa[1]
if parts[part].re != nil && parts[part].re.MatchString(word) {
continue
}
lineno++
var tags, lemma, postag string
if len(aa) > 2 {
tags = aa[2]
@@ -226,7 +232,7 @@ func dates() (start, first, last string, names []string) {
t3 := tStart
for range size {
y, w := t3.ISOWeek()
names = append(names, fmt.Sprintf("%d/%02d", y, w))
names = append(names, fmt.Sprintf("%d/w%02d", y, w))
t3 = t3.AddDate(0, 0, 7)
}
t3 = tStart

View File

@@ -26,13 +26,24 @@ var (
func main() {
files, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
dirs, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
x(err)
for _, dir := range dirs {
if !dir.IsDir() {
continue
}
dirname := dir.Name()
if dirname[0] != '2' {
continue
}
files, err := os.ReadDir("/net/corpora/nlnieuws/data/json/" + dirname)
x(err)
for _, file := range files {
filename := file.Name()
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
addWeek(filename[5:12])
for _, file := range files {
filename := file.Name()
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
addWeek(filename[5:12])
}
}
}

63
cmd/rang/rang.go Normal file
View File

@@ -0,0 +1,63 @@
package main
// alto 'fp://node[....]' 'tt:%w\t%I' $files | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang
import (
e "codeberg.org/pebbe/errors"
"bufio"
"fmt"
"os"
"sort"
"strings"
)
var (
x = e.ExitErr
)
// Item pairs a word with the number of input lines it was found on.
type Item struct {
word string
count int
}
// main reads lines from standard input, takes the text before the first tab
// of each line as a word, and counts how many lines each word occurs on.
// Words counted at least twice are printed as "rank<TAB>word", ordered by
// falling count and then by word. Words with equal counts share a rank;
// ranks are consecutive and start at 1.
func main() {
	freq := make(map[string]int)

	in := bufio.NewScanner(os.Stdin)
	for in.Scan() {
		word := strings.SplitN(in.Text(), "\t", 2)[0]
		freq[word]++
	}
	x(in.Err())

	ranked := make([]Item, 0)
	for word, n := range freq {
		ranked = append(ranked, Item{word: word, count: n})
	}
	sort.Slice(ranked, func(i, j int) bool {
		if ranked[i].count != ranked[j].count {
			return ranked[i].count > ranked[j].count
		}
		return ranked[i].word < ranked[j].word
	})

	rank, last := 0, 0
	for _, it := range ranked {
		// The list is sorted by falling count, so nothing after the first
		// single occurrence qualifies.
		if it.count < 2 {
			break
		}
		if it.count != last {
			rank++
			last = it.count
		}
		fmt.Printf("%d\t%s\n", rank, it.word)
	}
}

View File

@@ -1,114 +0,0 @@
package main
import (
e "codeberg.org/pebbe/errors"
"bufio"
"fmt"
"os"
"regexp"
"sort"
"strconv"
"strings"
)
// Item is one line of the current count file together with its change
// relative to the previous week's file.
type Item struct {
// text is the entry as it appears after the first tab of the line.
text string
// lctext is text in lower case; used as the tie-breaker when sorting.
lctext string
// score is the current count minus the previous count (0 if absent before).
score int
// isnew is true when the entry does not occur in the previous file.
isnew bool
}
var (
x = e.ExitErr
reYearWeek = regexp.MustCompile(`(.*)([12][0-9][0-9][0-9]-[0-5][0-9])(.*)`)
count = make(map[string]int)
items = make([]Item, 0)
)
// main compares the count file named by the first argument with the file of
// the preceding week (see getPrev) and prints, for every entry of the current
// file, a line "flag<TAB>score<TAB>text".
//
// flag is "N" for entries that are absent from the previous file and "."
// otherwise; score is the current count minus the previous count. Output is
// ordered by descending score, ties by lower-cased text.
func main() {
	current := os.Args[1]
	previous := getPrev(current)

	// scan hands every "count<TAB>text" line of path to handle.
	scan := func(path string, handle func(n int, text string)) {
		f, err := os.Open(path)
		x(err)
		sc := bufio.NewScanner(f)
		for sc.Scan() {
			fields := strings.SplitN(sc.Text(), "\t", 2)
			n, err := strconv.Atoi(strings.TrimSpace(fields[0]))
			x(err)
			handle(n, fields[1])
		}
		x(sc.Err())
		x(f.Close())
	}

	scan(previous, func(n int, text string) {
		count[text] = n
	})
	scan(current, func(n int, text string) {
		before, known := count[text]
		items = append(items, Item{
			text:   text,
			lctext: strings.ToLower(text),
			score:  n - before,
			isnew:  !known,
		})
	})

	sort.Slice(items, func(a, b int) bool {
		if items[a].score == items[b].score {
			return items[a].lctext < items[b].lctext
		}
		return items[a].score > items[b].score
	})

	for _, it := range items {
		marker := "."
		if it.isnew {
			marker = "N"
		}
		fmt.Printf("%s\t%4d\t%s\n", marker, it.score, it.text)
	}
}
// getPrev returns the name of the file for the week before the one encoded
// in filename.
//
// filename must contain a "yyyy-ww" year/week part (see reYearWeek); the text
// before and after it is kept unchanged. Going back from week 01 yields week
// 53 of the previous year when a file with that name exists, and week 52
// otherwise.
//
// A filename without a year/week part is reported through x instead of
// causing an index-out-of-range panic.
func getPrev(filename string) string {
	mm := reYearWeek.FindStringSubmatch(filename)
	if mm == nil {
		x(fmt.Errorf("no yyyy-ww week in file name %q", filename))
		return "" // only reached if x returns
	}

	year, err := strconv.Atoi(mm[2][:4])
	x(err)
	week, err := strconv.Atoi(mm[2][5:])
	x(err)

	week--
	if week == 0 {
		week = 53
		year--
	}

	newname := fmt.Sprintf("%s%d-%02d%s", mm[1], year, week, mm[3])
	if week == 53 {
		// Not every year has a week 53: fall back to week 52 when the
		// week-53 file is not there.
		if _, err := os.Stat(newname); err == nil {
			return newname
		}
		newname = fmt.Sprintf("%s%d-%02d%s", mm[1], year, week-1, mm[3])
	}
	return newname
}

View File

@@ -11,7 +11,7 @@ import (
var (
x = e.ExitErr
reFile = regexp.MustCompile(`(.*)(2[0-9][0-9][0-9]-[0-5][0-9])(.*)`)
reFile = regexp.MustCompile(`(.*)(2[0-9][0-9][0-9]\.[0-5][0-9])(.*)`)
seen = make(map[string]bool)
)
@@ -23,21 +23,30 @@ func main() {
suffix := m[3] + ".t20"
target := infile + ".t20"
x(os.Chdir("/net/corpora/nlnieuws/data"))
files, err := os.ReadDir(".")
dirs, err := os.ReadDir("..")
x(err)
for _, file := range files {
name := file.Name()
if strings.HasPrefix(name, prefix) && strings.HasSuffix(name, suffix) && name < target {
fp, err := os.Open(name)
x(err)
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
seen[strings.Split(scanner.Text(), "\t")[1]] = true
for _, dir := range dirs {
if !dir.IsDir() {
continue
}
dirname := dir.Name()
if dirname[0] != '2' {
continue
}
files, err := os.ReadDir("../" + dirname)
x(err)
for _, file := range files {
name := file.Name()
if strings.HasPrefix(name, prefix) && strings.HasSuffix(name, suffix) && name < target {
fp, err := os.Open("../" + dirname + "/" + name)
x(err)
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
seen[strings.Split(scanner.Text(), "\t")[1]] = true
}
x(scanner.Err())
x(fp.Close())
}
x(scanner.Err())
x(fp.Close())
}
}

108
cmd/trends/trends.go Normal file
View File

@@ -0,0 +1,108 @@
package main
import (
e "codeberg.org/pebbe/errors"
"bufio"
"fmt"
"os"
"sort"
"strconv"
"strings"
)
// Item is one word whose relative rank changed noticeably between the
// reference file and the current file.
type Item struct {
// word is the ranked word.
word string
// diff is the relative rank in the reference minus the relative rank now.
diff float64
// gone is true for words that occur in the reference but not in the current file.
gone bool
}
var (
x = e.ExitErr
)
// main compares two rank files and prints the words whose relative rank
// changed by more than 0.05.
//
// Usage: trends REFERENCE CURRENT
//
// Both files hold lines of the form "rank<TAB>word". The highest rank of
// REFERENCE is determined by scanning; the highest rank of CURRENT is taken
// from its last line, so CURRENT has to be ordered by ascending rank.
//
// Each output line is "diff<TAB>flag<TAB>word", ordered by descending diff and
// then by word. diff is the reference rank divided by (highest reference rank
// + 1) minus the current rank divided by (highest current rank + 1). flag is
// "X" for words that only occur in REFERENCE and empty otherwise.
//
// Missing arguments, an empty CURRENT file and lines without a tab are
// reported through x instead of causing an index-out-of-range panic.
func main() {
	if len(os.Args) != 3 {
		x(fmt.Errorf("usage: %s reference-ranks current-ranks", os.Args[0]))
	}

	// parse splits one "rank<TAB>word" line.
	parse := func(line string) (int, string) {
		aa := strings.Split(line, "\t")
		if len(aa) < 2 {
			x(fmt.Errorf("malformed line %q, want rank<TAB>word", line))
		}
		n, err := strconv.Atoi(aa[0])
		x(err)
		return n, aa[1]
	}

	refs := make(map[string]int)
	refmax := 0

	fp, err := os.Open(os.Args[1])
	x(err)
	scanner := bufio.NewScanner(fp)
	for scanner.Scan() {
		n, word := parse(scanner.Text())
		refs[word] = n
		if n > refmax {
			refmax = n
		}
	}
	x(scanner.Err())
	x(fp.Close())
	// One past the worst rank: used both as denominator and as the rank of
	// words that are missing on one side.
	refmax++

	lines := make([]string, 0)
	fp, err = os.Open(os.Args[2])
	x(err)
	scanner = bufio.NewScanner(fp)
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}
	x(scanner.Err())
	x(fp.Close())

	if len(lines) == 0 {
		x(fmt.Errorf("%s: no ranked words", os.Args[2]))
	}
	curmax, _ := parse(lines[len(lines)-1])
	curmax++

	items := make([]Item, 0)
	seen := make(map[string]bool)
	for _, line := range lines {
		n, word := parse(line)
		seen[word] = true
		m, ok := refs[word]
		if !ok {
			// A word that is new this period counts as ranked just below
			// the last reference word.
			m = refmax
		}
		diff := float64(m)/float64(refmax) - float64(n)/float64(curmax)
		if diff > 0.05 || diff < -0.05 {
			items = append(items, Item{
				word: word,
				diff: diff,
			})
		}
	}

	// Words that disappeared: their current relative rank is taken as 1.0.
	for key, value := range refs {
		if !seen[key] {
			diff := float64(value)/float64(refmax) - 1.0
			if diff > 0.05 || diff < -0.05 {
				items = append(items, Item{
					word: key,
					diff: diff,
					gone: true,
				})
			}
		}
	}

	sort.Slice(items, func(a, b int) bool {
		if items[a].diff == items[b].diff {
			return items[a].word < items[b].word
		}
		return items[a].diff > items[b].diff
	})

	for _, item := range items {
		var s string
		if item.gone {
			s = "X"
		}
		fmt.Printf("%f\t%s\t%s\n", item.diff, s, item.word)
	}
}

View File

@@ -15,9 +15,9 @@ var (
)
func main() {
aa := strings.Split(os.Args[1], "-")
aa := strings.Split(os.Args[1], ".")
if len(aa) != 2 {
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
}
year, err := strconv.Atoi(aa[0])
@@ -58,7 +58,7 @@ func main() {
fmt.Print(" -or")
}
y, w := t2.ISOWeek()
fmt.Printf(" -name %d-%02d.data.dz", y, w)
fmt.Printf(" -name %d.%02d.data.dz", y, w)
t2 = t2.AddDate(0, 0, 7)
}

50
cmd/weekadd/weekadd.go Normal file
View File

@@ -0,0 +1,50 @@
package main
import (
ww "git.web.rug.nl/p209327/nlnieuws/internal/week"
e "codeberg.org/pebbe/errors"
"fmt"
"io"
"os"
"strconv"
"strings"
)
var (
x = e.ExitErr
)
// syntax writes the command-line usage message, with the program name taken
// from os.Args[0], to w.
func syntax(w io.Writer) {
	const usage = `
Usage: %s yyyy.ww n
`
	prog := os.Args[0]
	fmt.Fprintf(w, usage, prog)
}
// main prints the ISO week that lies n weeks after (or, for negative n,
// before) the week given as "yyyy.ww".
//
// Usage: weekadd yyyy.ww n
//
// Called without any argument it prints the usage message to standard output
// and succeeds. Any other malformed invocation prints the usage message to
// standard error and exits with status 1, so that a caller such as
// ds1=$(weekadd $ds -1) under "set -e" notices the failure instead of
// continuing with an empty or bogus result.
func main() {
	switch {
	case len(os.Args) == 1:
		syntax(os.Stdout)
		return
	case len(os.Args) != 3:
		syntax(os.Stderr)
		os.Exit(1)
	}

	aa := strings.Split(os.Args[1], ".")
	if len(aa) != 2 {
		syntax(os.Stderr)
		os.Exit(1)
	}

	year, err := strconv.Atoi(aa[0])
	x(err)
	week, err := strconv.Atoi(aa[1])
	x(err)
	n, err := strconv.Atoi(os.Args[2])
	x(err)

	// WeekDate rejects week numbers that do not exist in the given year.
	t, err := ww.WeekDate(year, week)
	x(err)

	y, w := t.AddDate(0, 0, n*7).ISOWeek()
	fmt.Printf("%d.%02d\n", y, w)
}

View File

@@ -3,8 +3,9 @@
set -e
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=/net/corpora/nlnieuws/bin:/net/corpora/nlnieuws/python:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/corpora/nlnieuws/python/env/bin/activate
verbose=0
if [ "$1" = "-v" ]
@@ -22,10 +23,10 @@ say () {
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -35,11 +36,17 @@ else
esac
fi
cd /net/corpora/nlnieuws/data
ds1=$(weekadd $ds -1)
year=${ds%%.*}
mkdir -p /net/corpora/nlnieuws/data/$year
mkdir -p /net/corpora/nlnieuws/data/json/$year
cd /net/corpora/nlnieuws/data/$year
declare -A parts
#parts[alles]='.'
parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso'
parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso|Volkskrant'
parts[amsterdam]='AT5|BuurtAdam|Parool'
parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom'
parts[literatuur]='LitNL|Tzum'
@@ -60,6 +67,7 @@ parts[vlaanderen]='HLN|VRT'
#parts[Sargasso]='Sargasso'
#parts[Sikkom]='Sikkom'
#parts[Tzum]='Tzum'
#parts[Volkskrant]='Volkskrant'
#parts[VRT]='VRT'
for part in ${!parts[@]}
@@ -68,7 +76,7 @@ do
for i in 1 4
do
files=$(find .. $(week2files $ds $i) | grep -E "$regex") || true
files=$(find ../.. $(week2files $ds $i) | grep -E "$regex") || true
if [ -z "$files" ]
then
continue
@@ -158,10 +166,48 @@ do
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
> $part-nieuwe-adjww-extra-$ds-$i
# kale tellingen
say $part-allewoorden-$ds-$i
alto \
'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
'tt:%l\t%I' $files \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
| sed -e 's/\t.*//' | uniq -c \
| grep -v '^ *1 ' \
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
> $part-allewoorden-$ds-$i
done
# score
say $part-allewoorden-$ds.score14
stijgers.py $part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 $part-allewoorden-$ds.score14
stijgers2json.py $part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 $part-allewoorden-$ds.score14.json
say $part-locaties-$ds.score14
stijgers.py $part-locaties-$ds1-4 $part-locaties-$ds-1 $part-locaties-$ds.score14
stijgers2json.py $part-locaties-$ds1-4 $part-locaties-$ds-1 $part-locaties-$ds.score14.json
say $part-personen-$ds.score14
stijgers.py $part-personen-$ds1-4 $part-personen-$ds-1 $part-personen-$ds.score14
stijgers2json.py $part-personen-$ds1-4 $part-personen-$ds-1 $part-personen-$ds.score14.json
say $part-organisaties-$ds.score14
stijgers.py $part-organisaties-$ds1-4 $part-organisaties-$ds-1 $part-organisaties-$ds.score14
stijgers2json.py $part-organisaties-$ds1-4 $part-organisaties-$ds-1 $part-organisaties-$ds.score14.json
say $part-overige-namen-$ds.score14
stijgers.py $part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 $part-overige-namen-$ds.score14
stijgers2json.py $part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 $part-overige-namen-$ds.score14.json
done
data2json $ds 1 > json/DATA-$ds-1.json
data2json $ds 4 > json/DATA-$ds-4.json
dates2json > json/index1.json
dates2json > json/index4.json
data2json $ds 1 > ../json/$year/DATA-$ds-1.json
data2json $ds 4 > ../json/$year/DATA-$ds-4.json
dates2json > ../json/index1.json
dates2json > ../json/index4.json
# rechten bijwerken
chmod -R g+w /net/corpora/nlnieuws
chgrp -R software /net/corpora/nlnieuws

4
go.mod
View File

@@ -1,11 +1,13 @@
module nlnieuws
module git.web.rug.nl/p209327/nlnieuws
go 1.26.1
require (
codeberg.org/pebbe/errors v0.4.0
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5
github.com/pebbe/compactcorpus v1.0.3
github.com/pebbe/textcat/v2 v2.3.0
github.com/rug-compling/alpinods v1.18.1
)
require github.com/pebbe/util v0.9.0 // indirect

4
go.sum
View File

@@ -2,7 +2,11 @@ codeberg.org/pebbe/errors v0.4.0 h1:G05wsXpC/LRPaL02QYDwtz0sWFWQcIWK1s+MC79LBzU=
codeberg.org/pebbe/errors v0.4.0/go.mod h1:O7PPxUJM1bWRHq11CRK3wqVaH/3NnRaSVZvh3UhzDCY=
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 h1:tQbR4RKFBFi0+Ll69dXejKKUbQVNaOAT2fjlDvSAfx4=
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5/go.mod h1:kQE2lxPgVKe0JsBZMFFfMm5zBDCuRhaHFKOBzZeCLiw=
github.com/pebbe/compactcorpus v1.0.3 h1:6qlfXKHTKg7oWKLPCgEgv1scplfvphg/9l9XiRT2HzQ=
github.com/pebbe/compactcorpus v1.0.3/go.mod h1:SSpTeCZataCjjs82RJb8SOGdjkB3PlR7Z19EY4rInoQ=
github.com/pebbe/textcat/v2 v2.3.0 h1:RB2egIQgI2a2Ls+I9No6KFQKCZBIFt8Cc/SWCnVtC7Y=
github.com/pebbe/textcat/v2 v2.3.0/go.mod h1:WLXWuL+fOlQJqn6LmubjD+e78hCC6Y/rAWInh0wq/kg=
github.com/pebbe/util v0.9.0 h1:PMZd+CpWb8GbWEmFGlL3qd6XPuywl6xFIbrXWi870OA=
github.com/pebbe/util v0.9.0/go.mod h1:ynWl/SFX4+Seb9fpjVlYevr1f4TP7FrCmyZHiBCg69Q=
github.com/rug-compling/alpinods v1.18.1 h1:BvPcCnNEQ1QoVSc0RmwJd3kZmvo4iqZ52/vFzVvFS7w=
github.com/rug-compling/alpinods v1.18.1/go.mod h1:R3BBX8RIw9InVqHZ+1W+MsX8WX8uBkoVNNGE38mqF1Q=

74
internal/util/util.go Normal file
View File

@@ -0,0 +1,74 @@
package util
import (
e "codeberg.org/pebbe/errors"
"fmt"
"os"
"path/filepath"
"regexp"
"strings"
)
var (
// p hands an error to e.PanicErr; used by MkLock.
p = e.PanicErr
// reEOL matches sentence-final punctuation (. ! ?), optionally followed by
// one closing quote character, at the very end of the text.
reEOL = regexp.MustCompile(`[.!?]['"”’]?$`)
// reNEOL matches sentence-final punctuation (with optional closing quote)
// that is directly followed, without a space, by a capitalised word and an
// optional trailing period.
reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}+\.?`)
// reLET matches a single upper-case letter.
reLET = regexp.MustCompile(`\p{Lu}`)
// reBody matches an HTML line break tag: <br>, <br/>, <BR />, ...
reBody = regexp.MustCompile(`<[bB][rR][ /]*>`)
// reQuotLeft matches an opening emphasis tag.
reQuotLeft = regexp.MustCompile(`<em>|<i>`)
// reQuotRight matches a closing emphasis tag.
reQuotRight = regexp.MustCompile(`</em>|</i>`)
)
// HtmlFix prepares an HTML fragment for text extraction: opening <em>/<i>
// tags become " „", closing </em>/</i> tags become "” ", and <br> tags become
// a single space. All other markup is returned untouched.
func HtmlFix(html []byte) []byte {
	opened := reQuotLeft.ReplaceAllLiteral(html, []byte(" „"))
	closed := reQuotRight.ReplaceAllLiteral(opened, []byte("” "))
	return reBody.ReplaceAllLiteral(closed, []byte(" "))
}
// HtmlFixString is the string counterpart of HtmlFix: opening <em>/<i> tags
// become " „", closing </em>/</i> tags become "” ", and <br> tags become a
// single space.
func HtmlFixString(html string) string {
	opened := reQuotLeft.ReplaceAllLiteralString(html, " „")
	closed := reQuotRight.ReplaceAllLiteralString(opened, "” ")
	return reBody.ReplaceAllLiteralString(closed, " ")
}
// AddEnd turns s into a finished line of text. Surrounding white space is
// removed; an empty result stays empty. Otherwise a newline is appended, and
// a period is inserted before it when the text does not already end in
// sentence-final punctuation (optionally followed by a closing quote).
func AddEnd(s string) string {
	trimmed := strings.TrimSpace(s)
	switch {
	case trimmed == "":
		return ""
	case reEOL.MatchString(trimmed):
		return trimmed + "\n"
	default:
		return trimmed + ".\n"
	}
}
// FixSpace collapses every run of white space in s into a single space and
// removes leading and trailing white space.
//
// When the first optional argument is true it additionally separates
// sentences that were glued together: a space is inserted before a
// capitalised word that directly follows sentence-final punctuation
// (".De" becomes ". De"). A match that itself ends in a period is taken to
// be an abbreviation such as "v.Chr." and is left alone.
func FixSpace(s string, opt ...bool) string {
	collapsed := strings.Join(strings.Fields(s), " ")
	if len(opt) == 0 || !opt[0] {
		return collapsed
	}
	return reNEOL.ReplaceAllStringFunc(collapsed, func(hit string) string {
		if strings.HasSuffix(hit, ".") {
			return hit
		}
		// reNEOL guarantees an upper-case letter in every match.
		at := reLET.FindStringIndex(hit)[0]
		return hit[:at] + " " + hit[at:]
	})
}
// MkLock creates filename as a symbolic link whose target is the base name
// of filename followed by a dot and the current process id, and then reads
// the link back to verify that it carries that target.
//
// Any error from creating or reading the link, as well as a mismatching
// target, is handed to p.
func MkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got != target {
		p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
	}
}

30
internal/week/week.go Normal file
View File

@@ -0,0 +1,30 @@
package week
import (
"fmt"
"time"
)
// WeekDate returns the Monday (at 12:00 UTC) that starts ISO week `week` of
// ISO year `year`.
//
// When the week does not exist in that year (week 0, week 53 of a 52-week
// year, ...) an error is returned together with the computed date.
func WeekDate(year, week int) (time.Time, error) {
	jan1 := time.Date(year, time.January, 1, 12, 0, 0, 0, time.UTC)

	// ISO week 1 is the week that holds the first Thursday of the year:
	// count the days from 1 January to that Thursday.
	toThursday := (int(time.Thursday) - int(jan1.Weekday()) + 7) % 7

	// Step back three days to the Monday of week 1, then forward to the
	// requested week.
	monday := jan1.AddDate(0, 0, toThursday-3+7*(week-1))

	if y, w := monday.ISOWeek(); y != year || w != week {
		return monday, fmt.Errorf("week number out of range")
	}
	return monday, nil
}

66
oud/fix.go Normal file
View File

@@ -0,0 +1,66 @@
package main
/*
Dit past corpora aan
Tags verwijderen:
Oog: Nieuws
Parool: Nieuws
RO: Artikelen, cafeyn
RTVNoord: br_*
Tzum: Nieuws
Tags veranderen:
RTVNoord: tr_* → *
*/
import (
e "codeberg.org/pebbe/errors"
cc "github.com/pebbe/compactcorpus"
"github.com/rug-compling/alpinods"
"encoding/xml"
"fmt"
"os"
"strings"
)
var (
x = e.ExitErr
)
// main rewrites every corpus file named on the command line into a new
// file next to it ("X.data.dz" becomes "X-new.data.dz"), cleaning up the
// "tag" metadata of each document on the way:
//   - tags "Nieuws", "Artikelen", "cafeyn" and tags starting with "br_" are removed;
//   - tags starting with "tr_" lose that prefix.
// The input file itself is not modified.
func main() {
for _, file := range os.Args[1:] {
base := strings.TrimSuffix(file, ".data.dz")
newfile := base + "-new.data.dz"
incc, err := cc.Open(file)
x(err)
outcc, err := cc.NewCorpus(newfile)
x(err)
r, err := incc.NewRange()
x(err)
for r.HasNext() {
name, data := r.Next()
// Progress indicator; "\r" returns to the start of the line so the next
// name is printed on the same terminal line.
fmt.Printf("%s %s \r", base, name)
var alpino alpinods.AlpinoDS
x(xml.Unmarshal(data, &alpino))
for i := 0; i < len(alpino.Metadata.Meta); i++ {
if alpino.Metadata.Meta[i].Name != "tag" {
continue
}
if n := alpino.Metadata.Meta[i].Value; n == "Nieuws" || n == "Artikelen" || n == "cafeyn" || strings.HasPrefix(n, "br_") {
// Delete element i in place; step i back so the element that moved
// into position i is examined in the next iteration.
alpino.Metadata.Meta = append(alpino.Metadata.Meta[:i], alpino.Metadata.Meta[i+1:]...)
i--
} else if strings.HasPrefix(n, "tr_") {
alpino.Metadata.Meta[i].Value = n[3:]
}
}
// NOTE(review): a possible return value of Write is not checked here — confirm
// against the compactcorpus API whether it can report an error.
outcc.Write(name, []byte(alpino.String()))
}
// NOTE(review): incc is not closed explicitly; only the output corpus is.
x(outcc.Close())
}
}

118
oud/rang/collect-rang.sh Normal file
View File

@@ -0,0 +1,118 @@
#!/bin/bash
set -e
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
verbose=0
if [ "$1" = "-v" ]
then
shift
verbose=1
fi
# say prints its arguments, but only when verbose mode (-v) is on.
# Written as "not verbose OR echo" so the function still returns success
# under "set -e" when nothing is printed.
say () {
	[ "$verbose" != "1" ] || echo "$*"
}
# Select the week to process, as ISO year and ISO week number (yyyy.ww).
# Without an argument: the week that contains the date seven days ago.
if [ "$1" = "" ]
then
ds=`date -d -7days +%G.%V`
else
# With an argument: accept only something shaped like 2026.25.
case "$1" in
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
echo INVALID
exit 1
;;
esac
fi
year=${ds%%.*}
mkdir -p /net/corpora/nlnieuws/data/$year
mkdir -p /net/corpora/nlnieuws/data/json/$year
cd /net/corpora/nlnieuws/data/$year
declare -A parts
#parts[alles]='.'
parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso|Volkskrant'
#parts[amsterdam]='AT5|BuurtAdam|Parool'
#parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom'
#parts[literatuur]='LitNL|Tzum'
#parts[vlaanderen]='HLN|VRT'
#parts[AT5]='AT5'
#parts[BuurtAdam]='BuurtAdam'
#parts[BuurtGrn]='BuurtGrn'
#parts[GG]='GG'
#parts[HLN]='HLN'
#parts[LitNL]='LitNL'
#parts[NOS]='NOS'
#parts[NU]='NU'
#parts[NieuwsNL]='NieuwsNL'
#parts[Oog]='Oog'
#parts[Parool]='Parool'
#parts[RO]='RO'
#parts[RTVNoord]='RTVNoord'
#parts[Sargasso]='Sargasso'
#parts[Sikkom]='Sikkom'
#parts[Tzum]='Tzum'
#parts[Volkskrant]='Volkskrant'
#parts[VRT]='VRT'
# For every configured part (a named group of sources) and for both window
# sizes passed to week2files (1 and 4; presumably a number of weeks — verify
# against week2files), write five rank files into the current directory.
for part in ${!parts[@]}
do
# Alternation of source names; used to filter the file paths below.
regex=${parts[$part]}
for i in 1 4
do
# "|| true": grep exits non-zero when nothing matches, which would abort
# the script under "set -e".
files=$(find ../.. $(week2files $ds $i) | grep -E "$regex") || true
if [ -z "$files" ]
then
continue
fi
# Each pipeline below: extract "text<TAB>id" pairs with alto, strip a
# trailing ".<digits>" from the id, drop duplicate pairs, and let rang
# turn the per-text pair counts into ranks.
# Ordinary words and multi-word units that are not named entities.
say $part-rang-word-$ds-$i
alto \
'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
'tt:%l\t%I' $files \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
> $part-rang-word-$ds-$i
# Named entities of class LOC.
say $part-rang-loc-$ds-$i
alto \
'fp://node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="LOC" ])]' \
'tt:%l\t%I' $files \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
> $part-rang-loc-$ds-$i
# Named entities of class PER.
say $part-rang-per-$ds-$i
alto \
'fp://node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="PER" ])]' \
'tt:%l\t%I' $files \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
> $part-rang-per-$ds-$i
# Named entities of class ORG.
say $part-rang-org-$ds-$i
alto \
'fp://node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="ORG" ])]' \
'tt:%l\t%I' $files \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
> $part-rang-org-$ds-$i
# Named entities of class MISC.
say $part-rang-misc-$ds-$i
alto \
'fp://node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="MISC" ])]' \
'tt:%l\t%I' $files \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
> $part-rang-misc-$ds-$i
done
done

270
python/Claude.md Normal file
View File

@@ -0,0 +1,270 @@
---
title: Claude
updated: 2026-06-18 13:20:25Z
created: 2026-06-18 13:15:32Z
source: https://claude.ai/share/55700eff-d389-48d5-b1f5-cff59059ce61
---
> I have collected a dataset of news articles from the past few weeks. I would like to know which words occur significantly more or less frequently in the last week compared to the four weeks before. What are good methods to do this?
There are several good methods for detecting statistically significant shifts in word frequency between two time periods. Here's an overview:
### Corpus Comparison Methods
**1\. Log-Likelihood Ratio (G²)**
The most widely used method in corpus linguistics (Dunning, 1993). It compares observed vs. expected frequencies under the null hypothesis that both corpora are drawn from the same distribution. It handles low-frequency words better than chi-squared and works well even when the corpora are very different sizes — which matters here since you're comparing 1 week vs. 4 weeks.
**2\. Chi-Squared (χ²) Test**
A simpler alternative, but it's unreliable for low-frequency words (expected count < 5) and can produce many false positives in sparse data. Log-likelihood is generally preferred over it for text.
**3\. Mann-Whitney U / Permutation Tests**
Non-parametric tests that make no distributional assumptions. Useful if you want to compare word rates *per article* rather than pooled counts, since they respect article-level variance.
### Effect Size Measures (use alongside significance)
Significance alone doesn't tell you how *much* a word has changed. Pair it with:
- **%DIFF** — simple percentage difference in relative frequency, easy to interpret
- **Bayes Factor (BIC approximation)** — penalises complexity, good for prioritising the most meaningful differences
- **Log Ratio** — `log2(freq_week5 / freq_weeks1-4)`, gives a symmetric, interpretable effect size: +1 means doubled, −1 means halved
### Practical Considerations
| Issue | Recommendation |
| --- | --- |
| Unequal corpus sizes | Normalize to relative frequencies (per million words) before comparing |
| Multiple comparisons | Apply Benjamini-Hochberg FDR correction — you're testing thousands of words simultaneously |
| Low-frequency words | Set a minimum frequency threshold (e.g., appears at least 5 times in total) to reduce noise |
| Stop words | Decide whether to include them — they shift too and can be informative (e.g., rising use of "we" vs. "they") |
| Multiword expressions | Consider bigrams/trigrams if single words feel too decontextualised |
### Recommended Pipeline
1. **Tokenize and normalize** (lowercase, optional lemmatization)
2. **Count frequencies** in each time window
3. **Filter** by minimum frequency
4. **Run Log-Likelihood** for significance; compute **Log Ratio** for effect size
5. **Apply FDR correction** across all words
6. **Rank** by effect size among significant words
### Tools & Libraries
- **Python**: `scipy.stats` for chi-squared/G², `statsmodels` for FDR correction, `nltk`/`spacy` for tokenization
- **R**: the `quanteda` package has `textstat_keyness()` built-in, which implements log-likelihood and several other keyness measures out of the box — very convenient for exactly this use case
- **Ready-made**: AntConc (free GUI tool) supports keyness analysis directly
Would you like help implementing any of these in code?
> For the %DIFF, do you calculate it as "freq_week5 - freq_weeks1-4" ?
>
> Can you show scipy.stats code for chi-squared and G^2 ?
### %DIFF Formula
It's the relative difference in *normalised* frequencies (not raw counts, since your corpora are different sizes):
```
%DIFF = ((freq_B / size_B) - (freq_A / size_A)) / (freq_A / size_A) * 100
```
Where A is the reference corpus (weeks 1–4) and B is the target (week 5). So it answers: "by what percentage did the relative frequency go up or down?"
* * *
### Chi-Squared and G² in scipy
```python
import numpy as np
from scipy.stats import chi2_contingency, chi2
def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
"""
word : the word being tested
counts_recent : raw count in week 5
counts_reference : raw count in weeks 1-4
total_recent : total tokens in week 5
total_reference : total tokens in weeks 1-4
"""
a = counts_recent # word in recent
b = counts_reference # word in reference
c = total_recent - a # non-word in recent
d = total_reference - b # non-word in reference
contingency = np.array([[a, b],
[c, d]])
# --- Chi-Squared ---
chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)
# --- Log-Likelihood (G²) ---
# G² = 2 * sum(observed * log(observed / expected))
# scipy's chi2_contingency with lambda_="log-likelihood" computes this
g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")
# --- Effect sizes ---
freq_recent = a / total_recent
freq_reference = b / total_reference
pct_diff = (freq_recent - freq_reference) / freq_reference * 100
# Avoid log(0) with a small epsilon
eps = 1e-9
log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))
return {
"word": word,
"freq_recent": freq_recent,
"freq_reference": freq_reference,
"pct_diff": pct_diff,
"log_ratio": log_ratio,
"chi2": chi2_stat,
"p_chi2": p_chi2,
"g2": g2_stat,
"p_g2": p_g2,
}
```
A few notes:
- `correction=False` disables Yates' continuity correction for chi-squared, which is standard for large corpora
- `lambda_="log-likelihood"` is scipy's built-in way to get G²; it uses the same contingency table logic
- Both tests have **1 degree of freedom** for a 2×2 table, so the p-values are directly comparable
* * *
### Running it over a full vocabulary
```python
from statsmodels.stats.multitest import multipletests
results = [
corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
total_recent, total_reference)
for word in counts_recent
]
# FDR correction across all words
p_values = [r["p_g2"] for r in results]
_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
for r, p_adj in zip(results, p_adjusted):
r["p_g2_adjusted"] = p_adj
```
Would you like help loading your articles and building the frequency counts to feed into this?
> And what about tf-idf?
TF-IDF is a slightly different tool — worth understanding the distinction before deciding whether to use it.
### What TF-IDF actually measures
TF-IDF scores a word relative to how *distinctive* it is within a document compared to a collection. The formula is:
```
TF-IDF(word, doc) = TF(word, doc) × IDF(word, corpus)
```
- **TF** — how often the word appears in a specific document
- **IDF** — penalises words that appear in many documents (so "the" scores low)
It was designed for **ranking documents by relevance to a query**, not for comparing two time periods.
### The core problem for your use case
If you treat each time period as one big document, TF-IDF collapses to just IDF because TF becomes the same for both "documents" and cancels out. You'd just be finding words that are rare across the whole corpus, which isn't what you want.
You *could* treat each article as a document and then aggregate scores by time period, but then you're measuring **within-article distinctiveness**, not **frequency shifts over time**.
### When TF-IDF *is* useful in your pipeline
| Use case | Good fit? |
| --- | --- |
| Detecting words that increased/decreased in frequency week-over-week | No — use G² + log ratio instead |
| Finding the most topically distinctive words *within* a single article | Yes |
| Building features for a classifier or clustering articles by topic | Yes |
| Summarising what a time period was "about" relative to a background corpus | Partially — see below |
### The partial exception: keyness as pseudo-TF-IDF
Some researchers compute a **keyness score** by treating the recent period as the "document" and the reference period as the "corpus" — which is conceptually similar to TF-IDF. But the statistically rigorous version of this is exactly the log-likelihood + log ratio approach already described. G² is essentially the principled, significance-tested version of what TF-IDF approximates intuitively.
### Bottom line
For your goal — words that shifted significantly between week 5 and weeks 1–4 — G² with log ratio is the right tool. TF-IDF would give you a rough proxy at best and a misleading ranking at worst.
> The corpus_stats() function contains the following comment:
>
> `# G² = 2 * sum(observed * log(observed / expected))`
>
> Is this the formula to calculate the log-likelihood ratio? What do the observed and expected variables contain?
Yes, that's the correct formula for G². Let me break it down.
### The full formula
```
G² = 2 × Σ Oᵢ × ln(Oᵢ / Eᵢ)
```
The sum runs over all **four cells** of the contingency table:
| | Recent (week 5) | Reference (weeks 1–4) |
| --- | --- | --- |
| **Target word** | a | b |
| **All other words** | c | d |
### Observed values
Simply the raw counts from your data:
```
O₁ = a (word in recent)
O₂ = b (word in reference)
O₃ = c (all other words in recent)
O₄ = d (all other words in reference)
```
### Expected values
The counts you would *expect* if the word's relative frequency were identical in both corpora — i.e. under the null hypothesis that nothing changed. They're computed from the row and column marginals:
```
E₁ = (a + b) × (a + c) / N
E₂ = (a + b) × (b + d) / N
E₃ = (c + d) × (a + c) / N
E₄ = (c + d) × (b + d) / N
```
Where `N = a + b + c + d` is the total number of tokens across both corpora.
### Concrete example
Say your recent corpus has 10,000 tokens and your reference has 40,000 tokens, and the word "crisis" appears 30 times recently and 60 times in the reference:
```
a = 30, b = 60
c = 9970, d = 39940
N = 50000
E₁ = (30+60) × (30+9970) / 50000 = 90 × 10000 / 50000 = 18
E₂ = (30+60) × (60+39940) / 50000 = 90 × 40000 / 50000 = 72
E₃ = (9970+39940) × 10000 / 50000 = 49910 × 10000 / 50000 = 9982
E₄ = (9970+39940) × 40000 / 50000 = 49910 × 40000 / 50000 = 39928
G² = 2 × (30×ln(30/18) + 60×ln(60/72) + 9970×ln(9970/9982) + 39940×ln(39940/39928))
≈ 2 × (15.3 − 10.9 − 12.0 + 12.0)
≈ 8.8
```
"crisis" appears at about 1.67× its expected count in the recent week (30 observed vs. 18 expected), i.e. at twice its relative frequency in the reference period, and G² ≈ 8.8 exceeds the chi-squared critical value of 3.84 at p < 0.05 with 1 degree of freedom, so the increase is significant.

BIN
python/Claude.pdf Normal file

Binary file not shown.

4762
python/data.txt Normal file

File diff suppressed because it is too large Load Diff

9123
python/namen.html Normal file

File diff suppressed because one or more lines are too long

1563
python/namen.ipynb Normal file

File diff suppressed because it is too large Load Diff

82
python/namen.py Executable file
View File

@@ -0,0 +1,82 @@
#!/net/corpora/nlnieuws/notebook/bin/python3
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
import pandas as pd
# Show up to 40 rows when printing a DataFrame.  Setting display.max_rows
# alone has no visible effect on long frames: once a frame exceeds max_rows,
# pandas prints a truncated view of display.min_rows rows (10 by default),
# so min_rows has to be raised as well.
pd.set_option('display.max_rows', 40)
pd.set_option('display.min_rows', 40)
def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
    """
    Compare the frequency of one word between a recent and a reference corpus.

    word             : the word being tested
    counts_recent    : count of the word in the recent period
    counts_reference : count of the word in the reference period
    total_recent     : total count over all words in the recent period
    total_reference  : total count over all words in the reference period

    Counts may be fractional (this script substitutes 0.5 for missing words).

    Returns a dict with the relative frequencies, two effect sizes
    (pct_diff, log_ratio) and two test statistics with their p-values
    (chi2/p_chi2, g2/p_g2).  pct_diff is +inf when the word does not occur
    in the reference corpus at all.
    """
    a = counts_recent            # word in recent
    b = counts_reference         # word in reference
    c = total_recent - a         # non-word in recent
    d = total_reference - b      # non-word in reference

    contingency = np.array([[a, b],
                            [c, d]])

    # --- Chi-Squared ---
    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)

    # --- Log-Likelihood (G²) ---
    # G² = 2 * sum(observed * log(observed / expected))
    # NOTE(review): unlike the chi-squared call above, this call leaves
    # `correction` at its default (True), so for this 2x2 table scipy applies
    # Yates' continuity correction before computing the statistic.  Pass
    # correction=False if the plain G² of the formula above is intended.
    g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")

    # --- Effect sizes ---
    freq_recent = a / total_recent
    freq_reference = b / total_reference
    if freq_reference:
        pct_diff = (freq_recent - freq_reference) / freq_reference * 100
    else:
        # Word absent from the reference corpus: the relative increase is
        # unbounded; report +inf instead of raising ZeroDivisionError.
        pct_diff = float("inf")

    # Avoid log(0) with a small epsilon
    eps = 1e-9
    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))

    return {
        "word": word,
        "freq_recent": freq_recent,
        "freq_reference": freq_reference,
        "pct_diff": pct_diff,
        "log_ratio": log_ratio,
        "chi2": chi2_stat,
        "p_chi2": p_chi2,
        "g2": g2_stat,
        "p_g2": p_g2,
    }
# Read "word<TAB>reference count<TAB>recent count" lines.  Counts of zero
# (or below) are replaced by 0.5 so that every word has a non-zero frequency
# in both periods.
counts_recent = {}
counts_reference = {}
with open("data.txt", "rt", encoding="utf-8") as handle:
    for row in handle:
        fields = row.split("\t")
        counts_reference[fields[0]] = max(int(fields[1]), 0.5)
        counts_recent[fields[0]] = max(int(fields[2]), 0.5)

total_recent = sum(counts_recent.values())
total_reference = sum(counts_reference.values())

# One statistics record per word.
results = []
for word in counts_recent:
    results.append(
        corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
                     total_recent, total_reference))

# FDR correction across all words
p_values = [record["p_g2"] for record in results]
_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
for record, adjusted in zip(results, p_adjusted):
    record["p_g2_adjusted"] = adjusted

# Print the table three times: in input order, by G², and by %DIFF.
results = pd.DataFrame(results)
print(results)
print(results.sort_values('g2'))
print(results.sort_values('pct_diff'))

30
python/score.txt Normal file
View File

@@ -0,0 +1,30 @@
Er zijn twee simpele formules om de "effect size" van het verschil tussen twee
relatieve frequenties te rapporteren:
* %DIFF = (freq_B - freq_A) / freq_A * 100
Percentage verschil in relatieve frequenties (A = referentieperiode, B = recente periode), makkelijk te interpreteren,
maar niet symmetrisch.
* Log Ratio: log2(freq_B / freq_A)
Een symmetrische en interpreteerbare effect size; +1 is een verdubbeling, -1
een halvering
Twee populaire methodes om de significantie van frequentieverschillen te testen
(ook wel keyword extraction):
* Log-Likelihood Ratio (G^2): meest gebruikte methode in Corpus Linguistics.
Vergelijkt observed vs expected frequency.
* Chi-Squared test (X^2): simpeler dan G^2, maar geeft meer false positives
bij sparse data, werkt niet goed met lage frequenties.
Je kunt dan de gebruiker alleen de woorden met significante verschillen laten
zien (dit zijn dan de keywords). Ik heb met behulp van Claude een notebook in
elkaar gezet met een demonstratie van deze methodes:
→ Word freq comparison.ipynb
Er zijn ook geavanceerdere methodes die me te ingewikkeld lijken om te
implementeren, maar ik noem ze voor de volledigheid. In de stylometrie
is Burrows' Zeta populair, deze is bijv. beschikbaar in Stylo
https://github.com/computationalstylistics/stylo onder de oppose()
functie
https://cran.r-project.org/web/packages/stylo/stylo.pdf#Rfn.oppose.1 .
Er is ook een methode die gebruik maakt van Bayesiaanse statistiek en
frequenties uit een achtergrondcorpus, de Fightin' Words methode van
Monroe et al: https://github.com/jmhessel/FightingWords

104
python/stijgers.py Executable file
View File

@@ -0,0 +1,104 @@
#!/usr/bin/env python3
import sys
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
import pandas as pd
# Show up to 40 rows and all columns when printing a DataFrame.  Setting
# display.max_rows alone has no visible effect on long frames: once a frame
# exceeds max_rows, pandas prints a truncated view of display.min_rows rows
# (10 by default), so min_rows has to be raised as well.
pd.set_option('display.max_rows', 40)
pd.set_option('display.min_rows', 40)
pd.set_option('display.max_columns', None)
def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
    """
    Compare the relative frequency of one word in a recent corpus against a
    reference corpus.

    word             : the word being tested
    counts_recent    : count of the word in the recent corpus (week 5)
    counts_reference : count of the word in the reference corpus (weeks 1-4);
                       must be non-zero, because pct_diff divides by the
                       reference frequency
    total_recent     : total tokens in the recent corpus
    total_reference  : total tokens in the reference corpus

    Returns a dict with the keys "word", "freq_recent", "freq_reference",
    "pct_diff", "log_ratio", "chi2", "p_chi2", "g2" and "p_g2".
    """
    a = counts_recent                # word in recent
    b = counts_reference             # word in reference
    c = total_recent - a             # non-word in recent
    d = total_reference - b          # non-word in reference

    contingency = np.array([[a, b],
                            [c, d]])

    # --- Chi-Squared (Pearson, without continuity correction) ---
    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)

    # --- Log-Likelihood (G²) ---
    # G² = 2 * sum(observed * log(observed / expected))
    # scipy's chi2_contingency with lambda_="log-likelihood" computes this.
    # correction=False is required: for a 2x2 table scipy otherwise applies
    # Yates' continuity correction to the observed counts first, and the
    # result is no longer the plain G² above (nor comparable to chi2_stat).
    g2_stat, p_g2, _, _ = chi2_contingency(
        contingency, correction=False, lambda_="log-likelihood")

    # --- Effect sizes ---
    freq_recent = a / total_recent
    freq_reference = b / total_reference
    pct_diff = (freq_recent - freq_reference) / freq_reference * 100

    # Avoid log(0) with a small epsilon
    eps = 1e-9
    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))

    return {
        "word": word,
        "freq_recent": freq_recent,
        "freq_reference": freq_reference,
        "pct_diff": pct_diff,
        "log_ratio": log_ratio,
        "chi2": chi2_stat,
        "p_chi2": p_chi2,
        "g2": g2_stat,
        "p_g2": p_g2,
    }
def _read_counts(path):
    """Read a tab-separated "count<TAB>word" file into a {word: count} dict."""
    counts = {}
    with open(path, "rt", encoding="utf-8") as fp:
        for line in fp:
            if not line.strip():
                # tolerate blank lines (e.g. a stray newline at end of file)
                continue
            aa = line.split("\t")
            counts[aa[1].strip()] = int(aa[0])
    return counts


# Command line: old (reference) counts, new (recent) counts, output text file.
if len(sys.argv) < 4:
    sys.exit("usage: {} oudfile nieuwfile textfile".format(sys.argv[0]))

oudfile = sys.argv[1]
nieuwfile = sys.argv[2]
textfile = sys.argv[3]

counts_reference = _read_counts(oudfile)
counts_recent = _read_counts(nieuwfile)

# Smoothing: a word seen in only one of the two corpora gets a count of 0.5
# in the other, so that no frequency is zero when it is used as a divisor.
for key in counts_recent:
    counts_reference.setdefault(key, 0.5)
for key in counts_reference:
    counts_recent.setdefault(key, 0.5)

total_recent = sum(counts_recent.values())
total_reference = sum(counts_reference.values())

# After smoothing both dicts hold exactly the same set of words.
results = [
    corpus_stats(word, count, counts_reference[word],
                 total_recent, total_reference)
    for word, count in counts_recent.items()]

# FDR correction across all words
p_values = [r["p_g2"] for r in results]
_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
for r, p_adj in zip(results, p_adjusted):
    r["p_g2_adjusted"] = p_adj

results = pd.DataFrame(results)

# Report the 40 strongest risers and fallers, ranked by G².
with open(textfile, "wt", encoding="utf-8") as fp:
    fp.write("STIJGERS\n")
    fp.write(results[results.pct_diff > 0].sort_values('g2', ascending=False)[:40].to_string())
    fp.write("\n\nDALERS\n")
    fp.write(results[results.pct_diff < 0].sort_values('g2', ascending=False)[:40].to_string())
    fp.write("\n")

99
python/stijgers2json.py Executable file
View File

@@ -0,0 +1,99 @@
#!/usr/bin/env python3
import json
import sys
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
    """
    Compare the relative frequency of one word in a recent corpus against a
    reference corpus.

    word             : the word being tested
    counts_recent    : count of the word in the recent corpus (week 5)
    counts_reference : count of the word in the reference corpus (weeks 1-4);
                       must be non-zero, because pct_diff divides by the
                       reference frequency
    total_recent     : total tokens in the recent corpus
    total_reference  : total tokens in the reference corpus

    Returns a dict with the keys "word", "freq_recent", "freq_reference",
    "pct_diff", "log_ratio", "chi2", "p_chi2", "g2" and "p_g2".
    """
    a = counts_recent                # word in recent
    b = counts_reference             # word in reference
    c = total_recent - a             # non-word in recent
    d = total_reference - b          # non-word in reference

    contingency = np.array([[a, b],
                            [c, d]])

    # --- Chi-Squared (Pearson, without continuity correction) ---
    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)

    # --- Log-Likelihood (G²) ---
    # G² = 2 * sum(observed * log(observed / expected))
    # scipy's chi2_contingency with lambda_="log-likelihood" computes this.
    # correction=False is required: for a 2x2 table scipy otherwise applies
    # Yates' continuity correction to the observed counts first, and the
    # result is no longer the plain G² above (nor comparable to chi2_stat).
    g2_stat, p_g2, _, _ = chi2_contingency(
        contingency, correction=False, lambda_="log-likelihood")

    # --- Effect sizes ---
    freq_recent = a / total_recent
    freq_reference = b / total_reference
    pct_diff = (freq_recent - freq_reference) / freq_reference * 100

    # Avoid log(0) with a small epsilon
    eps = 1e-9
    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))

    return {
        "word": word,
        "freq_recent": freq_recent,
        "freq_reference": freq_reference,
        "pct_diff": pct_diff,
        "log_ratio": log_ratio,
        "chi2": chi2_stat,
        "p_chi2": p_chi2,
        "g2": g2_stat,
        "p_g2": p_g2,
    }
def _read_counts(path):
    """Read a tab-separated "count<TAB>word" file into a {word: count} dict."""
    counts = {}
    with open(path, "rt", encoding="utf-8") as fp:
        for line in fp:
            if not line.strip():
                # tolerate blank lines (e.g. a stray newline at end of file)
                continue
            aa = line.split("\t")
            counts[aa[1].strip()] = int(aa[0])
    return counts


def _top(rows, limit=40):
    """Return the `limit` rows with the highest G², strongest first."""
    return sorted(rows, key=lambda x: x['g2'], reverse=True)[:limit]


# Command line: old (reference) counts, new (recent) counts, output JSON file.
if len(sys.argv) < 4:
    sys.exit("usage: {} oudfile nieuwfile jsonfile".format(sys.argv[0]))

oudfile = sys.argv[1]
nieuwfile = sys.argv[2]
jsonfile = sys.argv[3]

counts_reference = _read_counts(oudfile)
counts_recent = _read_counts(nieuwfile)

# Smoothing: a word seen in only one of the two corpora gets a count of 0.5
# in the other, so that no frequency is zero when it is used as a divisor.
for key in counts_recent:
    counts_reference.setdefault(key, 0.5)
for key in counts_reference:
    counts_recent.setdefault(key, 0.5)

total_recent = sum(counts_recent.values())
total_reference = sum(counts_reference.values())

# After smoothing both dicts hold exactly the same set of words.
results = [
    corpus_stats(word, count, counts_reference[word],
                 total_recent, total_reference)
    for word, count in counts_recent.items()]

# FDR correction across all words
p_values = [r["p_g2"] for r in results]
_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
for r, p_adj in zip(results, p_adjusted):
    r["p_g2_adjusted"] = p_adj

o = {}
# Disabled variant that keeps only words with p_g2 < .05:
#o['up'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] > 0], key=lambda x: x['g2'], reverse=True)[:40]
#o['dn'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] < 0], key=lambda x: x['g2'], reverse=True)[:40]
# 'up' holds the 40 strongest risers, 'dn' the 40 strongest fallers.
o['up'] = _top([x for x in results if x['pct_diff'] > 0])
o['dn'] = _top([x for x in results if x['pct_diff'] < 0])

with open(jsonfile, "wt", encoding="utf-8") as fp:
    json.dump(o, fp)

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff Show More