Compare commits

..

19 Commits

Author SHA1 Message Date
Peter Kleiweg
a8bea0ab44 updates 2026-06-10 13:31:55 +02:00
Peter Kleiweg
d7adc17d4b Volkskrant: columns hebben geen intro 2026-06-09 18:37:19 +02:00
Peter Kleiweg
a9f9e17acf gone gone 2026-06-06 21:05:00 +02:00
Peter Kleiweg
1f4a084624 gone, trends 2026-06-06 17:10:38 +02:00
Peter Kleiweg
9f29222909 ranglijsten 2026-06-06 14:46:01 +02:00
Peter Kleiweg
a76fa21584 update 2026-06-05 16:05:46 +02:00
Peter Kleiweg
efa301cc4a Tzum: <em>Titel</em> -> "Titel" 2026-06-03 17:32:01 +02:00
Peter Kleiweg
14590570ba fix voor zinnen aan elkaar 2026-05-29 17:22:10 +02:00
Peter Kleiweg
ca4e7af8fa tags; .De -> . De 2026-05-29 12:22:57 +02:00
Peter Kleiweg
66581d4e98 data2json.go: aangepast aan grote reorganisatie 2026-05-28 02:59:46 +02:00
Peter Kleiweg
e53049e62f top20.go: aangepast aan grote reorganisatie 2026-05-28 02:36:55 +02:00
Peter Kleiweg
7f23212fc3 app: checkmark bij datumkiezer 2026-05-27 23:19:30 +02:00
Peter Kleiweg
5c651387af grote reorganisatie:
- HLN, NOS, NU, VRT: per week -> per dag
- yyyy-ww -> yyyy.ww
- yyyy*  -> yyyy/yyyy*
etc
2026-05-27 22:42:03 +02:00
Peter Kleiweg
e430ff576b update readme 2026-05-25 12:37:55 +02:00
Peter Kleiweg
9d82f11536 README 2026-05-24 19:13:09 +02:00
Peter Kleiweg
650a13eb4a weg: cmd/score (2) 2026-05-24 16:44:26 +02:00
Peter Kleiweg
bf0407b933 weg: cmd/score 2026-05-24 16:42:12 +02:00
Peter Kleiweg
fcad105a75 helperfuncties naar internal/util (2) 2026-05-24 16:29:56 +02:00
Peter Kleiweg
75832c3132 fix title voor HLN en Parool; helperfuncties naar internal/util 2026-05-24 16:16:21 +02:00
92 changed files with 1778 additions and 1406 deletions

5
.gitignore vendored
View File

@@ -38,14 +38,17 @@ Sikkom/sikkom
Tzum/metadata
Tzum/tzum
Tzum/xml2txt
Volkskrant/metadata
Volkskrant/volkskrant
VRT/metadata
VRT/vrt
bin/data2json
bin/dates2json
bin/flush
bin/items2count
bin/score
bin/rang
bin/top20
bin/trends
bin/week2files
20??
corpus

View File

@@ -3,11 +3,11 @@ all: \
metadata \
at5
xml2txt: cmd/xml2txt/*.go
go build -o $@ $^
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
go build -o $@ $<
metadata: cmd/metadata/*.go
go build -o $@ $^
at5: cmd/at5/*.go
go build -o $@ $^
at5: cmd/at5/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,13 +3,14 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -46,7 +47,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/AT5/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -76,7 +77,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -110,16 +111,3 @@ func main() {
}()
}
}
// mkLock claims the lock at filename by creating a symbolic link there whose
// target is "<base name of filename>.<pid of this process>". The link is then
// read back and compared with that target, so a lock that turns out to point
// somewhere else is reported instead of being silently accepted.
//
// Every failure is handed to p, the file's error handler (defined elsewhere
// in this file).
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	// Read the link back: only a link carrying our own target counts as ours.
	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"os"
@@ -20,7 +22,7 @@ type Item struct {
var (
x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
)
func main() {
@@ -29,16 +31,16 @@ func main() {
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
ds = fmt.Sprintf("%d.%02d", year, week)
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy.ww"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/AT5/" + dp))
x(os.MkdirAll("out", 0777))
@@ -55,39 +57,15 @@ func main() {
x(err)
var item Item
x(xml.Unmarshal(b, &item), filename)
x(fp.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body/p | //body/h2`)
x(err)
for _, p := range pp {
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
}
x(fp.Close())
}
}
// addEnd turns s into one line of output that ends in sentence-final
// punctuation.
//
// Surrounding white space is trimmed first. If nothing is left, "" is
// returned (no line at all). Otherwise the text is returned followed by
// "\n", with a "." inserted before the newline unless the text already ends
// in '.', '!' or '?', optionally followed by one closing quote: ", ' or the
// curly ”.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Look behind a single closing quote, if there is one. Suffix matching is
	// used instead of fixed byte offsets because ” takes three bytes in UTF-8.
	core := s
	for _, quote := range []string{`"`, `'`, `”`} {
		if strings.HasSuffix(core, quote) {
			core = strings.TrimSuffix(core, quote)
			break
		}
	}

	if n := len(core); n > 0 {
		switch core[n-1] {
		case '.', '!', '?':
			return s + "\n"
		}
	}
	return s + ".\n"
}
// fixSpace collapses every run of white space in s into a single space and
// drops leading and trailing white space. Input without any non-space
// characters yields "".
func fixSpace(s string) string {
	var b strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(word)
	}
	return b.String()
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/AT5
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/AT5/corpus/$ds
cd /net/corpora/nlnieuws/AT5/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out
mkdir out
../../xml2txt $ds
xml2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,7 +58,7 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
buurtadam: cmd/buurtadam/*.go
go build -o $@ $^
buurtadam: cmd/buurtadam/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -4,13 +4,14 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -64,7 +65,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/BuurtAdam/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -94,7 +95,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -158,6 +159,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)
@@ -202,7 +205,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
}
for _, div := range divs {
p(fp.WriteString(addEnd(fixSpace(div.Content()))))
p(fp.WriteString(u.AddEnd(u.FixSpace(div.Content()))))
}
p(fp.Close())
@@ -211,40 +214,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
return true
}
// addEnd turns s into one line of output that ends in sentence-final
// punctuation.
//
// Surrounding white space is trimmed first. If nothing is left, "" is
// returned (no line at all). Otherwise the text is returned followed by
// "\n", with a "." inserted before the newline unless the text already ends
// in '.', '!' or '?', optionally followed by one closing quote: ", ' or the
// curly ”.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Look behind a single closing quote, if there is one. Suffix matching is
	// used instead of fixed byte offsets because ” takes three bytes in UTF-8.
	core := s
	for _, quote := range []string{`"`, `'`, `”`} {
		if strings.HasSuffix(core, quote) {
			core = strings.TrimSuffix(core, quote)
			break
		}
	}

	if n := len(core); n > 0 {
		switch core[n-1] {
		case '.', '!', '?':
			return s + "\n"
		}
	}
	return s + ".\n"
}
// fixSpace collapses every run of white space in s into a single space and
// drops leading and trailing white space. Input without any non-space
// characters yields "".
func fixSpace(s string) string {
	var b strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(word)
	}
	return b.String()
}
// mkLock claims the lock at filename by creating a symbolic link there whose
// target is "<base name of filename>.<pid of this process>". The link is then
// read back and compared with that target, so a lock that turns out to point
// somewhere else is reported instead of being silently accepted.
//
// Every failure is handed to p, the file's error handler (defined elsewhere
// in this file).
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	// Read the link back: only a link carrying our own target counts as ours.
	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/BuurtAdam
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/BuurtAdam/corpus/$ds
cd /net/corpora/nlnieuws/BuurtAdam/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +56,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
buurtgrn: cmd/buurtgrn/*.go
go build -o $@ $^
buurtgrn: cmd/buurtgrn/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -6,11 +6,11 @@ import (
"encoding/xml"
"fmt"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -64,7 +64,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/BuurtGrn/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -94,7 +94,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -158,6 +158,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)
@@ -202,7 +204,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
}
for _, div := range divs {
p(fp.WriteString(addEnd(fixSpace(div.Content()))))
p(fp.WriteString(u.AddEnd(u.FixSpace(div.Content()))))
}
p(fp.Close())
@@ -211,40 +213,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
return true
}
// addEnd turns s into one line of output that ends in sentence-final
// punctuation.
//
// Surrounding white space is trimmed first. If nothing is left, "" is
// returned (no line at all). Otherwise the text is returned followed by
// "\n", with a "." inserted before the newline unless the text already ends
// in '.', '!' or '?', optionally followed by one closing quote: ", ' or the
// curly ”.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Look behind a single closing quote, if there is one. Suffix matching is
	// used instead of fixed byte offsets because ” takes three bytes in UTF-8.
	core := s
	for _, quote := range []string{`"`, `'`, `”`} {
		if strings.HasSuffix(core, quote) {
			core = strings.TrimSuffix(core, quote)
			break
		}
	}

	if n := len(core); n > 0 {
		switch core[n-1] {
		case '.', '!', '?':
			return s + "\n"
		}
	}
	return s + ".\n"
}
// fixSpace collapses every run of white space in s into a single space and
// drops leading and trailing white space. Input without any non-space
// characters yields "".
func fixSpace(s string) string {
	var b strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(word)
	}
	return b.String()
}
// mkLock claims the lock at filename by creating a symbolic link there whose
// target is "<base name of filename>.<pid of this process>". The link is then
// read back and compared with that target, so a lock that turns out to point
// somewhere else is reported instead of being silently accepted.
//
// Every failure is handed to p, the file's error handler (defined elsewhere
// in this file).
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	// Read the link back: only a link carrying our own target counts as ours.
	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/BuurtGrn
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/BuurtGrn/corpus/$ds
cd /net/corpora/nlnieuws/BuurtGrn/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +56,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
gg: cmd/gg/*.go
go build -o $@ $^
gg: cmd/gg/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -4,13 +4,14 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -64,7 +65,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/GG/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -94,7 +95,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -154,6 +155,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)
@@ -191,10 +194,10 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
fp, err := os.Create(filename + ".txt")
p(err)
p(fp.WriteString(addEnd(fixSpace(title))))
p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
for _, el := range ell {
p(fp.WriteString(addEnd(fixSpace(el.Content()))))
p(fp.WriteString(u.AddEnd(u.FixSpace(el.Content()))))
}
p(fp.Close())
@@ -203,40 +206,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
return true
}
// addEnd turns s into one line of output that ends in sentence-final
// punctuation.
//
// Surrounding white space is trimmed first. If nothing is left, "" is
// returned (no line at all). Otherwise the text is returned followed by
// "\n", with a "." inserted before the newline unless the text already ends
// in '.', '!' or '?', optionally followed by one closing quote: ", ' or the
// curly ”.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Look behind a single closing quote, if there is one. Suffix matching is
	// used instead of fixed byte offsets because ” takes three bytes in UTF-8.
	core := s
	for _, quote := range []string{`"`, `'`, `”`} {
		if strings.HasSuffix(core, quote) {
			core = strings.TrimSuffix(core, quote)
			break
		}
	}

	if n := len(core); n > 0 {
		switch core[n-1] {
		case '.', '!', '?':
			return s + "\n"
		}
	}
	return s + ".\n"
}
// fixSpace collapses every run of white space in s into a single space and
// drops leading and trailing white space. Input without any non-space
// characters yields "".
func fixSpace(s string) string {
	var b strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(word)
	}
	return b.String()
}
// mkLock claims the lock at filename by creating a symbolic link there whose
// target is "<base name of filename>.<pid of this process>". The link is then
// read back and compared with that target, so a lock that turns out to point
// somewhere else is reported instead of being silently accepted.
//
// Every failure is handed to p, the file's error handler (defined elsewhere
// in this file).
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	// Read the link back: only a link carrying our own target counts as ours.
	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/GG
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/GG/corpus/$ds
cd /net/corpora/nlnieuws/GG/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,7 +56,7 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
hln: cmd/hln/*.go
go build -o $@ $^
hln: cmd/hln/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -4,13 +4,16 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
// "encoding/json"
"encoding/xml"
"fmt"
// "html"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -25,9 +28,16 @@ type ItemT struct {
UnixTime int64 `xml:"unixTime"`
Guid string `xml:"guid"`
Link string `xml:"link"`
Title string `xml:"title"`
Data []byte `xml:",innerxml"`
}
/*
type GraphT struct {
Graph []map[string]any `json:"@graph"`
}
*/
var (
p = e.PanicErr
w = e.WarnErr
@@ -62,7 +72,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/HLN/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -91,8 +101,7 @@ func main() {
t, err = time.Parse(time.RFC1123, item.PubDate)
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
if exists(dirname + "/lock") {
continue
}
@@ -127,18 +136,19 @@ func main() {
p(fp.WriteString("</item>\n"))
p(fp.Close())
p(os.Chtimes(filename+".xml", t, t))
ok = doArticle(filename, item.Link, t, needUpdate)
ok = doArticle(filename, item.Link, item.Title, t, needUpdate)
}()
}
}
func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) (ok bool) {
func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) {
if exists(filename + ".skip") {
return true
}
if needUpdate {
_ = os.Remove(filename + ".err")
_ = os.Remove(filename + ".html")
// _ = os.Remove(filename + ".json")
_ = os.Remove(filename + ".txt")
} else {
if exists(filename + ".txt") {
@@ -158,6 +168,62 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
/*
s := string(body)
ok = true
i1 := strings.Index(s, `type="application/ld+json"`)
if i1 < 0 {
ok = false
} else {
i1 += strings.Index(s[i1:], `>`) + 1
i2 := i1 + strings.Index(s[i1:], `</script>`)
if i2 < i1 {
ok = false
} else {
s = html.UnescapeString(s[i1:i2])
}
}
if !ok {
_ = w(fmt.Errorf("script jsonld not found: %s", url))
fp, err := os.Create(filename + ".err")
p(err)
p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
p(fp.Close())
p(os.Chtimes(filename+".err", timestamp, timestamp))
fp, err = os.Create(filename + ".html")
p(err)
p(fp.Write(body))
p(fp.Close())
p(os.Chtimes(filename+".html", timestamp, timestamp))
return false
}
var graph GraphT
p(json.Unmarshal([]byte(s), &graph))
for _, g := range graph.Graph {
t := g["@type"]
switch v := t.(type) {
case string:
if v == "NewsArticle" {
b, err := json.Marshal(g)
p(err)
s = string(b)
}
}
}
fp, err := os.Create(filename + ".json")
p(err)
p(fp.WriteString(s))
p(fp.Close())
p(os.Chtimes(filename+".json", timestamp, timestamp))
*/
doc, err := gokogiri.ParseHtml(body)
p(err)
@@ -196,18 +262,6 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
pars := make([]string, 0)
ell, err = article.Search(`.//*[@data-content-type="TITLE"]`)
p(err)
if len(ell) != 1 {
_ = w(fmt.Errorf("found %d titles: %s", len(ell), url))
}
for _, el := range ell {
s := strings.TrimSpace(el.Content())
if s != "" {
pars = append(pars, s)
}
}
hasIntro := false
ell, err = article.Search(`.//*[@data-content-type="INTRO"]`)
p(err)
@@ -224,8 +278,8 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
specials, err := article.Search(`.//*[@data-content-type="GROUP"]`)
p(err)
for _, special := range specials {
special.Remove()
for i := len(specials) - 1; i >= 0; i-- {
specials[i].Remove()
}
other, err := article.Search(`.//*[@data-content-type="PODCAST"]`)
@@ -285,12 +339,14 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(fmt.Fprintln(fp, "##META text tag ="))
} else {
for _, tag := range tags {
p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
}
}
p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
for _, par := range pars {
p(fp.WriteString(addEnd(fixSpace(par))))
p(fp.WriteString(u.AddEnd(u.FixSpace(par))))
}
p(fp.Close())
@@ -299,43 +355,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
return true
}
// addEnd turns s into one line of output that ends in sentence-final
// punctuation.
//
// Surrounding white space is trimmed first. If nothing is left, "" is
// returned. Otherwise the text is returned followed by "\n", with a "."
// inserted before the newline unless the text already ends in one of the
// endings listed below: '.', '!' or '?', bare or followed by a closing ", '
// or curly ”.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Endings that already close a sentence.
	for _, ending := range []string{
		".", "!", "?",
		`."`, `!"`, `?"`,
		`.'`, `!'`, `?'`,
		`.”`, `!”`, `?”`,
	} {
		if strings.HasSuffix(s, ending) {
			return s + "\n"
		}
	}
	return s + ".\n"
}
// fixSpace collapses every run of white space in s into a single space and
// drops leading and trailing white space. Input without any non-space
// characters yields "".
func fixSpace(s string) string {
	var b strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(word)
	}
	return b.String()
}
// mkLock claims the lock at filename by creating a symbolic link there whose
// target is "<base name of filename>.<pid of this process>". The link is then
// read back and compared with that target, so a lock that turns out to point
// somewhere else is reported instead of being silently accepted.
//
// Every failure is handed to p, the file's error handler (defined elsewhere
// in this file).
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	// Read the link back: only a link carrying our own target counts as ours.
	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/HLN
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -2days +%Y-%m-%d`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1
;;
*)
@@ -23,10 +26,11 @@ else
fi
dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/HLN/corpus/$ds
cd /net/corpora/nlnieuws/HLN/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +55,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -3,11 +3,11 @@ all: \
metadata \
litnl
xml2txt: cmd/xml2txt/*.go
go build -o $@ $^
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
go build -o $@ $<
metadata: cmd/metadata/*.go
go build -o $@ $^
litnl: cmd/litnl/*.go
go build -o $@ $^
litnl: cmd/litnl/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,13 +3,14 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -46,7 +47,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/LitNL/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -76,7 +77,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -108,16 +109,3 @@ func main() {
}
}
// mkLock claims the lock at filename by creating a symbolic link there whose
// target is "<base name of filename>.<pid of this process>". The link is then
// read back and compared with that target, so a lock that turns out to point
// somewhere else is reported instead of being silently accepted.
//
// Every failure is handed to p, the file's error handler (defined elsewhere
// in this file).
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	// Read the link back: only a link carrying our own target counts as ours.
	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"os"
@@ -22,7 +24,7 @@ var (
w = e.WarnErr
x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
)
func main() {
@@ -31,16 +33,16 @@ func main() {
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
ds = fmt.Sprintf("%d.%02d", year, week)
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy.ww"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/LitNL/" + dp))
x(os.MkdirAll("out", 0777))
@@ -58,10 +60,10 @@ func main() {
var item Item
x(xml.Unmarshal(b, &item))
for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
}
x(fp.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body//p`)
@@ -74,32 +76,8 @@ func main() {
_ = w(fmt.Errorf("empty: %s", filename))
}
for _, p := range pp {
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
}
x(fp.Close())
}
}
// addEnd turns s into one line of output that ends in sentence-final
// punctuation.
//
// Surrounding white space is trimmed first. If nothing is left, "" is
// returned (no line at all). Otherwise the text is returned followed by
// "\n", with a "." inserted before the newline unless the text already ends
// in '.', '!' or '?', optionally followed by one closing quote: ", ' or the
// curly ”.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Look behind a single closing quote, if there is one. Suffix matching is
	// used instead of fixed byte offsets because ” takes three bytes in UTF-8.
	core := s
	for _, quote := range []string{`"`, `'`, `”`} {
		if strings.HasSuffix(core, quote) {
			core = strings.TrimSuffix(core, quote)
			break
		}
	}

	if n := len(core); n > 0 {
		switch core[n-1] {
		case '.', '!', '?':
			return s + "\n"
		}
	}
	return s + ".\n"
}
// fixSpace collapses every run of white space in s into a single space and
// drops leading and trailing white space. Input without any non-space
// characters yields "".
func fixSpace(s string) string {
	var b strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(word)
	}
	return b.String()
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/LitNL
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/LitNL/corpus/$ds
cd /net/corpora/nlnieuws/LitNL/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out
mkdir out
../../xml2txt $ds
xml2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -16,13 +16,15 @@ all:
make -C Sargasso
make -C Sikkom
make -C Tzum
make -C Volkskrant
make -C VRT
make bin/data2json
make bin/dates2json
make bin/flush
make bin/items2count
make bin/score
make bin/rang
make bin/top20
make bin/trends
make bin/week2files
bin/data2json: cmd/data2json/*.go
@@ -37,12 +39,15 @@ bin/flush: cmd/flush/*.go
bin/items2count: cmd/items2count/*.go
go build -o $@ $^
bin/score: cmd/score/*.go
bin/rang: cmd/rang/*.go
go build -o $@ $^
bin/top20: cmd/top20/*.go
go build -o $@ $^
bin/trends: cmd/trends/*.go
go build -o $@ $^
bin/week2files: cmd/week2files/*.go
go build -o $@ $^

View File

@@ -3,11 +3,11 @@ all: \
metadata \
nos
json2txt: cmd/json2txt/*.go
go build -o $@ $^
json2txt: cmd/json2txt/*.go ../internal/util/*.go
go build -o $@ $<
metadata: cmd/metadata/*.go
go build -o $@ $^
nos: cmd/nos/*.go
go build -o $@ $^
nos: cmd/nos/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,6 +3,8 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/json"
"fmt"
"os"
@@ -26,7 +28,7 @@ type Item struct {
var (
x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]$`)
)
func main() {
@@ -34,17 +36,17 @@ func main() {
var ds string
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
t := time.Now().AddDate(0, 0, -2)
ds = fmt.Sprintf("%d-%02d-%02d", t.Year(), int(t.Month()), t.Day())
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy-mm-dd"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := strings.ReplaceAll(ds, "-", "/")
x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp))
x(os.MkdirAll("out", 0777))
@@ -61,13 +63,15 @@ func main() {
x(err)
item := getItem(b, filename)
for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)))
x(fmt.Fprintf(fp, "##META text cat = %s\n", u.FixSpace(cat)))
}
for _, tag := range item.Tags {
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
}
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
for _, line := range strings.SplitAfter(item.Text, "\n") {
x(fp.WriteString(u.AddEnd(u.FixSpace(line, true))))
}
x(fp.WriteString(addEnd(fixSpace(item.Title))))
x(fp.WriteString(fixSpace(item.Text)))
x(fp.Close())
}
}
@@ -90,27 +94,3 @@ func getItem(b []byte, filename string) Item {
x(json.Unmarshal(b, &item), filename)
return item
}
// addEnd turns s into one line of output that ends in sentence-final
// punctuation.
//
// Surrounding white space is trimmed first. If nothing is left, "" is
// returned (no line at all). Otherwise the text is returned followed by
// "\n", with a "." inserted before the newline unless the text already ends
// in '.', '!' or '?', optionally followed by one closing quote: ", ' or the
// curly ”.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Look behind a single closing quote, if there is one. Suffix matching is
	// used instead of fixed byte offsets because ” takes three bytes in UTF-8.
	core := s
	for _, quote := range []string{`"`, `'`, `”`} {
		if strings.HasSuffix(core, quote) {
			core = strings.TrimSuffix(core, quote)
			break
		}
	}

	if n := len(core); n > 0 {
		switch core[n-1] {
		case '.', '!', '?':
			return s + "\n"
		}
	}
	return s + ".\n"
}
// fixSpace collapses every run of white space in s into a single space and
// drops leading and trailing white space. Input without any non-space
// characters yields "".
func fixSpace(s string) string {
	var b strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(word)
	}
	return b.String()
}

View File

@@ -1,17 +1,17 @@
package main
import (
"html"
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"html"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -63,7 +63,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/NOS/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -94,8 +94,7 @@ func main() {
}
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
if exists(dirname + "/lock") {
continue
}
@@ -195,16 +194,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(os.Chtimes(filename+".json", timestamp, timestamp))
return true
}
// mkLock claims the lock at filename by creating a symbolic link there whose
// target is "<base name of filename>.<pid of this process>". The link is then
// read back and compared with that target, so a lock that turns out to point
// somewhere else is reported instead of being silently accepted.
//
// Every failure is handed to p, the file's error handler (defined elsewhere
// in this file).
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	// Read the link back: only a link carrying our own target counts as ours.
	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/NOS
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -2days +%Y-%m-%d`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1
;;
*)
@@ -23,10 +26,11 @@ else
fi
dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/NOS/corpus/$ds
cd /net/corpora/nlnieuws/NOS/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +42,7 @@ fi
rm -fr out
mkdir out
../../json2txt $ds
json2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,15 +57,15 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
nu: cmd/nu/*.go
go build -o $@ $^
nu: cmd/nu/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,6 +3,8 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"bytes"
"encoding/json"
"encoding/xml"
@@ -12,7 +14,6 @@ import (
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -73,7 +74,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/NU/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -97,8 +98,7 @@ func main() {
t, err = time.Parse(time.RFC1123, item.PubDate)
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
if exists(dirname + "/lock") {
continue
}
@@ -161,6 +161,8 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
s := string(body)
ok := true
i1 := strings.Index(s, `<script type="application/ld+json"`)
@@ -226,22 +228,9 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
// text bevat kopjes zonder punt aan het eind
lines := strings.Split(text, "\n")
for i, line := range lines {
line = fixSpace(line)
n := len(line)
if n > 0 {
if strings.ContainsAny(line[n-1:], ".!?") {
continue
}
}
if n > 1 {
s := line[n-2:]
if s == `."` || s == `!"` || s == `?"` {
continue
}
}
lines[i] = line + "."
lines[i] = u.AddEnd(u.FixSpace(line, true))
}
text = strings.Join(lines, "\n") + "\n"
text = strings.Join(lines, "") + "\n"
fp, err := os.Create(filename + ".txt")
p(err)
@@ -249,7 +238,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(fmt.Fprintln(fp, "##META text tag ="))
} else {
for _, tag := range tags {
p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
}
}
p(fp.WriteString(text))
@@ -259,20 +248,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
return true
}
// fixSpace normalises whitespace in s: leading and trailing whitespace is
// dropped and every internal run of whitespace becomes a single space.
func fixSpace(s string) string {
	var sb strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			sb.WriteByte(' ')
		}
		sb.WriteString(word)
	}
	return sb.String()
}
// mkLock creates the lock for this program: a symbolic link at filename
// whose target is "<basename of filename>.<pid>". It then reads the link
// back and reports an error through p when the target is not the one just
// written.
//
// Every failure is passed to p, the package-level error helper.
// NOTE(review): p presumably panics on a non-nil error - confirm against
// its definition elsewhere in this file.
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	// Read the link back and compare it with what was written.
	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/NU
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -2days +%Y-%m-%d`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1
;;
*)
@@ -23,10 +26,11 @@ else
fi
dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/NU/corpus/$ds
cd /net/corpora/nlnieuws/NU/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +55,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
nieuwsnl: cmd/nieuwsnl/*.go
go build -o $@ $^
nieuwsnl: cmd/nieuwsnl/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"bytes"
"encoding/xml"
"fmt"
@@ -11,7 +13,6 @@ import (
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -64,7 +65,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/NieuwsNL/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -153,6 +154,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)
@@ -173,11 +176,11 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
} else {
for _, a := range aa {
tag = strings.ReplaceAll(a.Content(), "\n", " ")
p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag)))
p(fmt.Fprintf(&buf, "##META text tag = %s\n", u.FixSpace(tag)))
}
}
p(buf.WriteString(addEnd(fixSpace(title))))
p(buf.WriteString(u.AddEnd(u.FixSpace(title))))
// oud: //div[@id="article-blocks"]//p
pp, err := root.Search(`//div[@id="article-blocks"]//div[contains(@class, "paragraph-content")]`)
@@ -204,7 +207,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
return false // echt fout
}
for _, p1 := range pp {
p(buf.WriteString(addEnd(fixSpace(p1.Content()))))
p(buf.WriteString(u.AddEnd(u.FixSpace(p1.Content()))))
}
fp, err := os.Create(filename + ".txt")
@@ -216,40 +219,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
return true
}
// addEnd turns s into one newline-terminated line of text.
//
// Surrounding whitespace is trimmed first. An empty result yields "" (no
// newline). If the text already ends in '.', '!' or '?', optionally
// followed by one ASCII double or single quote, only "\n" is appended.
// Otherwise ".\n" is appended, so headings without final punctuation
// still end in a full stop.
func addEnd(s string) string {
	trimmed := strings.TrimSpace(s)
	if trimmed == "" {
		return ""
	}
	// Endings that already count as a finished sentence.
	finished := []string{
		".", "!", "?",
		`."`, `!"`, `?"`,
		`.'`, `!'`, `?'`,
	}
	for _, end := range finished {
		if strings.HasSuffix(trimmed, end) {
			return trimmed + "\n"
		}
	}
	return trimmed + ".\n"
}
// fixSpace normalises whitespace in s: leading and trailing whitespace is
// dropped and every internal run of whitespace becomes a single space.
func fixSpace(s string) string {
	var sb strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			sb.WriteByte(' ')
		}
		sb.WriteString(word)
	}
	return sb.String()
}
// mkLock creates the lock for this program: a symbolic link at filename
// whose target is "<basename of filename>.<pid>". It then reads the link
// back and reports an error through p when the target is not the one just
// written.
//
// Every failure is passed to p, the package-level error helper.
// NOTE(review): p presumably panics on a non-nil error - confirm against
// its definition elsewhere in this file.
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	// Read the link back and compare it with what was written.
	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -2,8 +2,11 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/NieuwsNL
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
@@ -25,10 +28,11 @@ else
fi
dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/NieuwsNL/corpus/$ds
cd /net/corpora/nlnieuws/NieuwsNL/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -53,14 +57,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -3,11 +3,11 @@ all: \
metadata \
oog
xml2txt: cmd/xml2txt/*.go
go build -o $@ $^
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
go build -o $@ $<
metadata: cmd/metadata/*.go
go build -o $@ $^
oog: cmd/oog/*.go
go build -o $@ $^
oog: cmd/oog/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,13 +3,14 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -46,7 +47,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/Oog/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -76,7 +77,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -111,16 +112,3 @@ func main() {
}
}
// mkLock creates the lock for this program: a symbolic link at filename
// whose target is "<basename of filename>.<pid>". It then reads the link
// back and reports an error through p when the target is not the one just
// written.
//
// Every failure is passed to p, the package-level error helper.
// NOTE(review): p presumably panics on a non-nil error - confirm against
// its definition elsewhere in this file.
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	// Read the link back and compare it with what was written.
	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"os"
@@ -21,7 +23,7 @@ type Item struct {
var (
x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
)
func main() {
@@ -30,16 +32,16 @@ func main() {
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
ds = fmt.Sprintf("%d.%02d", year, week)
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy.ww"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/Oog/" + dp))
x(os.MkdirAll("out", 0777))
@@ -57,41 +59,20 @@ func main() {
var item Item
x(xml.Unmarshal(b, &item))
for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
t := u.FixSpace(cat)
if t != "Nieuws" {
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
}
}
x(fp.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body/p`)
x(err)
for _, p := range pp {
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
}
x(fp.Close())
}
}
// addEnd turns s into one newline-terminated line of text.
//
// Surrounding whitespace is trimmed first. An empty result yields "" (no
// newline). If the text already ends in '.', '!' or '?', optionally
// followed by one ASCII double or single quote, only "\n" is appended.
// Otherwise ".\n" is appended.
func addEnd(s string) string {
	line := strings.TrimSpace(s)
	if len(line) == 0 {
		return ""
	}
	last := len(line) - 1
	switch line[last] {
	case '.', '!', '?':
		return line + "\n"
	case '"', '\'':
		// A closing quote only counts when sentence punctuation precedes it.
		if last > 0 {
			switch line[last-1] {
			case '.', '!', '?':
				return line + "\n"
			}
		}
	}
	return line + ".\n"
}
// fixSpace normalises whitespace in s: leading and trailing whitespace is
// dropped and every internal run of whitespace becomes a single space.
func fixSpace(s string) string {
	var sb strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			sb.WriteByte(' ')
		}
		sb.WriteString(word)
	}
	return sb.String()
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Oog
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Oog/corpus/$ds
cd /net/corpora/nlnieuws/Oog/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out
mkdir out
../../xml2txt $ds
xml2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
parool: cmd/parool/*.go
go build -o $@ $^
parool: cmd/parool/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -4,13 +4,16 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
//"encoding/json"
"encoding/xml"
"fmt"
//"html"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -25,9 +28,16 @@ type ItemT struct {
UnixTime int64 `xml:"unixTime"`
Guid string `xml:"guid"`
Link string `xml:"link"`
Title string `xml:"title"`
Data []byte `xml:",innerxml"`
}
/*
type GraphT struct {
Graph []map[string]any `json:"@graph"`
}
*/
var (
p = e.PanicErr
w = e.WarnErr
@@ -62,7 +72,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/Parool/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -92,7 +102,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -122,18 +132,19 @@ func main() {
p(fp.WriteString("</item>\n"))
p(fp.Close())
p(os.Chtimes(filename+".xml", t, t))
ok = doArticle(filename, item.Link, t, needUpdate)
ok = doArticle(filename, item.Link, item.Title, t, needUpdate)
}()
}
}
func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) (ok bool) {
func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) {
if exists(filename + ".skip") {
return true
}
if needUpdate {
_ = os.Remove(filename + ".err")
_ = os.Remove(filename + ".html")
// _ = os.Remove(filename + ".json")
_ = os.Remove(filename + ".txt")
} else {
if exists(filename + ".txt") {
@@ -153,9 +164,67 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)
/*
s := string(body)
ok = true
i1 := strings.Index(s, `<script type="application/ld+json"`)
if i1 < 0 {
ok = false
} else {
i1 += strings.Index(s[i1:], `>`) + 1
i2 := i1 + strings.Index(s[i1:], `</script>`)
if i2 < i1 {
ok = false
} else {
s = html.UnescapeString(s[i1:i2])
}
}
if !ok {
_ = w(fmt.Errorf("script jsonld not found: %s", url))
fp, err := os.Create(filename + ".err")
p(err)
p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
p(fp.Close())
p(os.Chtimes(filename+".err", timestamp, timestamp))
fp, err = os.Create(filename + ".html")
p(err)
p(fp.Write(body))
p(fp.Close())
p(os.Chtimes(filename+".html", timestamp, timestamp))
return false
}
var graph GraphT
p(json.Unmarshal([]byte(s), &graph))
for _, g := range graph.Graph {
t := g["@type"]
switch v := t.(type) {
case string:
if v == "NewsArticle" {
b, err := json.Marshal(g)
p(err)
s = string(b)
}
}
}
fp, err := os.Create(filename + ".json")
p(err)
p(fp.WriteString(s))
p(fp.Close())
p(os.Chtimes(filename+".json", timestamp, timestamp))
*/
root := doc.Root()
articles, err := root.Search(`//article[@id="article-content"]`)
@@ -211,6 +280,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
}
header := headers[0]
isVideo := false
tags := make([]string, 0)
ell, err := header.Search(`.//*[@data-test-id="article-label"]`)
p(err)
@@ -219,25 +289,16 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
}
for _, el := range ell {
s := strings.TrimSpace(el.Content())
if s != "" {
if s != "" && s != "Nieuws" {
tags = append(tags, s)
}
if strings.ToLower(s) == "video" {
isVideo = true
}
}
pars := make([]string, 0)
ell, err = header.Search(`.//*[@data-test-id="article-title"]`)
p(err)
if len(ell) != 1 {
_ = w(fmt.Errorf("found %d titles: %s", len(ell), url))
}
for _, el := range ell {
s := strings.TrimSpace(el.Content())
if s != "" {
pars = append(pars, s)
}
}
found := false
ell, err = header.Search(`.//*[@data-test-id="header-intro"]`)
p(err)
@@ -252,10 +313,10 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
_ = w(fmt.Errorf("no intro: %s", url))
}
specials, err := article.Search(`.//section//aside | .//section//figure | .//section//b`)
specials, err := article.Search(`.//aside | .//figure | .//figcaption | .//section//b`)
p(err)
for _, special := range specials {
special.Remove()
for i := len(specials) - 1; i >= 0; i-- {
specials[i].Remove()
}
ell, err = article.Search(`.//section//*[@data-article-element-index]`)
@@ -287,7 +348,9 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
}
}
if !found {
_ = w(fmt.Errorf("no text, skipping: %s", url))
if !isVideo {
_ = w(fmt.Errorf("no text, skipping: %s", url))
}
fp, err := os.Create(filename + ".skip")
p(fp.WriteString(url + "\n"))
p(err)
@@ -309,12 +372,14 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(fmt.Fprintln(fp, "##META text tag ="))
} else {
for _, tag := range tags {
p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
}
}
p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
for _, par := range pars {
p(fp.WriteString(addEnd(fixSpace(par))))
p(fp.WriteString(u.AddEnd(u.FixSpace(par))))
}
p(fp.Close())
@@ -323,43 +388,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
return true
}
// addEnd turns s into one newline-terminated line of text.
//
// Surrounding whitespace is trimmed first. An empty result yields "" (no
// newline). If the text already ends in '.', '!' or '?', optionally
// followed by one closing quote (ASCII double quote, ASCII single quote,
// or the typographic ”), only "\n" is appended. Otherwise ".\n" is
// appended.
func addEnd(s string) string {
	trimmed := strings.TrimSpace(s)
	if trimmed == "" {
		return ""
	}
	// Endings that already count as a finished sentence.
	finished := []string{
		".", "!", "?",
		`."`, `!"`, `?"`,
		`.'`, `!'`, `?'`,
		`.”`, `!”`, `?”`,
	}
	for _, end := range finished {
		if strings.HasSuffix(trimmed, end) {
			return trimmed + "\n"
		}
	}
	return trimmed + ".\n"
}
// fixSpace normalises whitespace in s: leading and trailing whitespace is
// dropped and every internal run of whitespace becomes a single space.
func fixSpace(s string) string {
	var sb strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			sb.WriteByte(' ')
		}
		sb.WriteString(word)
	}
	return sb.String()
}
// mkLock creates the lock for this program: a symbolic link at filename
// whose target is "<basename of filename>.<pid>". It then reads the link
// back and reports an error through p when the target is not the one just
// written.
//
// Every failure is passed to p, the package-level error helper.
// NOTE(review): p presumably panics on a non-nil error - confirm against
// its definition elsewhere in this file.
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	// Read the link back and compare it with what was written.
	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Parool
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Parool/corpus/$ds
cd /net/corpora/nlnieuws/Parool/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +56,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

95
README.md Normal file
View File

@@ -0,0 +1,95 @@
# Actuele nieuwsberichten
- voor Alpino: nieuwe namen en nieuwe woorden
- voor *Woord van de maand* ook: toplijsten van personen, plaatsen,
organisaties en andere namen (TODO: url van webapp)
Voor interactief gebruik:
```
query.sh
```
## 1. Verzamelen van berichten
Berichten van bronnen met veel data (o.a. HLN, NOS, NU, NieuwsNL, VRT) per dag in `[A-Z]*/yyyy/mm/dd/`
Overige per week in `[A-Z]*/yyyy/wNN/` (NN = weeknummer)
crontab van p209327@colossus
```
# m h dom mon dow command
3 * * * * /net/corpora/nlnieuws/AT5/at5
4 * * * * /net/corpora/nlnieuws/BuurtAdam/buurtadam
5 * * * * /net/corpora/nlnieuws/BuurtGrn/buurtgrn
6 * * * * /net/corpora/nlnieuws/GG/gg
7 * * * * /net/corpora/nlnieuws/HLN/hln
8 * * * * /net/corpora/nlnieuws/LitNL/litnl
9 * * * * /net/corpora/nlnieuws/NieuwsNL/nieuwsnl
10 * * * * /net/corpora/nlnieuws/NOS/nos
11 * * * * /net/corpora/nlnieuws/NU/nu
12 * * * * /net/corpora/nlnieuws/Oog/oog
13 * * * * /net/corpora/nlnieuws/Parool/parool
14 * * * * /net/corpora/nlnieuws/RO/ro
15 * * * * /net/corpora/nlnieuws/RTVNoord/rtvnoord
16 * * * * /net/corpora/nlnieuws/Sargasso/sargasso
17 * * * * /net/corpora/nlnieuws/Sikkom/sikkom
18 * * * * /net/corpora/nlnieuws/Tzum/tzum
19 * * * * /net/corpora/nlnieuws/VRT/vrt
20 * * * * /net/corpora/nlnieuws/Volkskrant/volkskrant
```
## 2. Teksten verwerken: omzetten naar zinnen, parsen, metadata toevoegen
Uitvoer in `[A-Z]*/corpus/`
crontab van p209327@colossus
```
# m h dom mon dow command
# veel data: elke dag
0 1 * * * /net/corpora/nlnieuws/HLN/txt2corpus.sh
0 1 * * * /net/corpora/nlnieuws/NOS/txt2corpus.sh
0 1 * * * /net/corpora/nlnieuws/NU/txt2corpus.sh
0 1 * * * /net/corpora/nlnieuws/NieuwsNL/txt2corpus.sh
0 1 * * * /net/corpora/nlnieuws/VRT/txt2corpus.sh
0 1 * * * /net/corpora/nlnieuws/Volkskrant/txt2corpus.sh
# weinig data: alleen op dinsdag
0 1 * * 2 /net/corpora/nlnieuws/AT5/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/BuurtAdam/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/BuurtGrn/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/GG/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/LitNL/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/Oog/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/Parool/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/RO/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/RTVNoord/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/Sargasso/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/Sikkom/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/Tzum/txt2corpus.sh
```
## 3. Queries uitvoeren, tellingen doen
Tellingen in `data/`
Gegevens voor webapp in `data/json/`
Op woensdag
crontab van p209327@colossus
```
# m h dom mon dow command
0 1 * * 3 /net/corpora/nlnieuws/collect.sh
```
## 4. Data in json op webplatform zetten
crontab van f109308@colossus
```
# m h dom mon dow command
30 0-23/4 * * * rsync -e 'ssh -F /net/aistaff/alfa/.ssh/config' -a --no-g /net/corpora/nlnieuws/data/json/ webalfa:/home/www/f109308/site/wvdm/data
```

View File

@@ -3,11 +3,11 @@ all: \
metadata \
ro
xml2txt: cmd/xml2txt/*.go
go build -o $@ $^
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
go build -o $@ $<
metadata: cmd/metadata/*.go
go build -o $@ $^
ro: cmd/ro/*.go
go build -o $@ $^
ro: cmd/ro/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,13 +3,14 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -46,7 +47,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/RO/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -76,7 +77,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -111,16 +112,3 @@ func main() {
}
}
// mkLock creates the lock for this program: a symbolic link at filename
// whose target is "<basename of filename>.<pid>". It then reads the link
// back and reports an error through p when the target is not the one just
// written.
//
// Every failure is passed to p, the package-level error helper.
// NOTE(review): p presumably panics on a non-nil error - confirm against
// its definition elsewhere in this file.
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	// Read the link back and compare it with what was written.
	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -5,6 +5,8 @@ import (
"github.com/jbowtie/gokogiri"
"github.com/pebbe/textcat/v2"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"bytes"
"encoding/xml"
"fmt"
@@ -24,7 +26,7 @@ var (
x = e.ExitErr
w = e.WarnErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
)
func main() {
@@ -36,16 +38,16 @@ func main() {
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
ds = fmt.Sprintf("%d.%02d", year, week)
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy.ww"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/RO/" + dp))
x(os.MkdirAll("out", 0777))
@@ -61,19 +63,19 @@ func main() {
var buf bytes.Buffer
var item Item
x(xml.Unmarshal(b, &item))
x(buf.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(buf.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
divs, err := root.Search(`//div[@class="donatieformlinks"]`)
x(err)
for _, div := range divs {
div.Remove()
for i := len(divs) - 1; i >= 0; i-- {
divs[i].Remove()
}
pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`)
x(err)
for _, p := range pp {
x(buf.WriteString(addEnd(fixSpace(p.Content()))))
x(buf.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
}
text := buf.String()
@@ -90,33 +92,12 @@ func main() {
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
x(err)
for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
t := u.FixSpace(cat)
if t != "Artikelen" && t != "cafeyn" {
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
}
}
x(fp.WriteString(text))
x(fp.Close())
}
}
// addEnd turns s into one newline-terminated line of text.
//
// Surrounding whitespace is trimmed first. An empty result yields "" (no
// newline). If the text already ends in '.', '!' or '?', optionally
// followed by one ASCII double or single quote, only "\n" is appended.
// Otherwise ".\n" is appended.
func addEnd(s string) string {
	line := strings.TrimSpace(s)
	if len(line) == 0 {
		return ""
	}
	last := len(line) - 1
	switch line[last] {
	case '.', '!', '?':
		return line + "\n"
	case '"', '\'':
		// A closing quote only counts when sentence punctuation precedes it.
		if last > 0 {
			switch line[last-1] {
			case '.', '!', '?':
				return line + "\n"
			}
		}
	}
	return line + ".\n"
}
// fixSpace normalises whitespace in s: leading and trailing whitespace is
// dropped and every internal run of whitespace becomes a single space.
func fixSpace(s string) string {
	var sb strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			sb.WriteByte(' ')
		}
		sb.WriteString(word)
	}
	return sb.String()
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/RO
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/RO/corpus/$ds
cd /net/corpora/nlnieuws/RO/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out
mkdir out
../../xml2txt $ds
xml2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
rtvnoord: cmd/rtvnoord/*.go
go build -o $@ $^
rtvnoord: cmd/rtvnoord/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,6 +3,8 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/json"
"encoding/xml"
"fmt"
@@ -10,7 +12,6 @@ import (
"io"
"net/http"
"os"
"path/filepath"
"strings"
"time"
)
@@ -75,7 +76,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/RTVNoord/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -100,7 +101,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -225,7 +226,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
// text bevat kopjes zonder punt aan het eind
lines := strings.Split(doc.Text, "\n")
for i, line := range lines {
lines[i] = addEnd(fixSpace(line))
lines[i] = u.AddEnd(u.FixSpace(line, true))
}
text := strings.Join(lines, "") + "\n"
@@ -235,16 +236,21 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(fmt.Fprintln(fp, "##META text tag ="))
} else {
for _, tag := range doc.Tags {
p(fmt.Fprintf(fp, "##META text tag = %s\n", strings.ToLower(fixSpace(tag))))
t := strings.ToLower(u.FixSpace(tag))
if strings.HasPrefix(t, "br_") {
continue
}
t = strings.TrimPrefix(t, "tr_")
p(fmt.Fprintf(fp, "##META text tag = %s\n", t))
}
}
if doc.Cat == "" {
p(fmt.Fprintln(fp, "##META text cat ="))
} else {
p(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(doc.Cat)))
p(fmt.Fprintf(fp, "##META text cat = %s\n", u.FixSpace(doc.Cat)))
}
p(fp.WriteString(addEnd(doc.Title)))
p(fp.WriteString(u.AddEnd(doc.Title)))
p(fp.WriteString(text))
p(fp.Close())
@@ -252,40 +258,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
return true
}
// addEnd turns s into one newline-terminated line of text.
//
// Surrounding whitespace is trimmed first. An empty result yields "" (no
// newline). If the text already ends in '.', '!' or '?', optionally
// followed by one ASCII double or single quote, only "\n" is appended.
// Otherwise ".\n" is appended, so headings without final punctuation
// still end in a full stop.
func addEnd(s string) string {
	trimmed := strings.TrimSpace(s)
	if trimmed == "" {
		return ""
	}
	// Endings that already count as a finished sentence.
	finished := []string{
		".", "!", "?",
		`."`, `!"`, `?"`,
		`.'`, `!'`, `?'`,
	}
	for _, end := range finished {
		if strings.HasSuffix(trimmed, end) {
			return trimmed + "\n"
		}
	}
	return trimmed + ".\n"
}
// fixSpace normalises whitespace in s: leading and trailing whitespace is
// dropped and every internal run of whitespace becomes a single space.
func fixSpace(s string) string {
	var sb strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			sb.WriteByte(' ')
		}
		sb.WriteString(word)
	}
	return sb.String()
}
// mkLock creates the lock for this program: a symbolic link at filename
// whose target is "<basename of filename>.<pid>". It then reads the link
// back and reports an error through p when the target is not the one just
// written.
//
// Every failure is passed to p, the package-level error helper.
// NOTE(review): p presumably panics on a non-nil error - confirm against
// its definition elsewhere in this file.
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	// Read the link back and compare it with what was written.
	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/RTVNoord
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/RTVNoord/corpus/$ds
cd /net/corpora/nlnieuws/RTVNoord/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,15 +56,15 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -3,11 +3,11 @@ all: \
metadata \
sargasso
xml2txt: cmd/xml2txt/*.go
go build -o $@ $^
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
go build -o $@ $<
metadata: cmd/metadata/*.go
go build -o $@ $^
sargasso: cmd/sargasso/*.go
go build -o $@ $^
sargasso: cmd/sargasso/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,13 +3,14 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -46,7 +47,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/Sargasso/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -76,7 +77,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -111,16 +112,3 @@ func main() {
}
}
// mkLock creates the lock for this program: a symbolic link at filename
// whose target is "<basename of filename>.<pid>". It then reads the link
// back and reports an error through p when the target is not the one just
// written.
//
// Every failure is passed to p, the package-level error helper.
// NOTE(review): p presumably panics on a non-nil error - confirm against
// its definition elsewhere in this file.
func mkLock(filename string) {
	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(target, filename))

	// Read the link back and compare it with what was written.
	got, err := os.Readlink(filename)
	p(err)
	if got == target {
		return
	}
	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
}

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"os"
@@ -21,7 +23,7 @@ type Item struct {
var (
x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
)
func main() {
@@ -30,16 +32,16 @@ func main() {
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
ds = fmt.Sprintf("%d.%02d", year, week)
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy.ww"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/Sargasso/" + dp))
x(os.MkdirAll("out", 0777))
@@ -57,41 +59,17 @@ func main() {
var item Item
x(xml.Unmarshal(b, &item))
for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
}
x(fp.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body//p`)
x(err)
for _, p := range pp {
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
}
x(fp.Close())
}
}
func addEnd(s string) string {
s = strings.TrimSpace(s)
n := len(s)
if n == 0 {
return ""
}
if n > 0 {
if strings.ContainsAny(s[n-1:], ".!?") {
return s + "\n"
}
}
if n > 1 {
s2 := s[n-2:]
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
return s + "\n"
}
}
return s + ".\n"
}
func fixSpace(s string) string {
return strings.Join(strings.Fields(s), " ")
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Sargasso
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Sargasso/corpus/$ds
cd /net/corpora/nlnieuws/Sargasso/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out
mkdir out
../../xml2txt $ds
xml2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
sikkom: cmd/sikkom/*.go
go build -o $@ $^
sikkom: cmd/sikkom/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"html"
@@ -11,7 +13,6 @@ import (
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -64,7 +65,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/Sikkom/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -89,7 +90,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -151,6 +152,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
s := string(body)
ok := true
@@ -220,49 +223,12 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
fp, err = os.Create(filename + ".txt")
p(err)
p(fp.WriteString(addEnd(fixSpace(title))))
p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
for _, p1 := range pp {
p(fp.WriteString(addEnd(fixSpace(p1.Content()))))
p(fp.WriteString(u.AddEnd(u.FixSpace(p1.Content()))))
}
p(fp.Close())
return true
}
func addEnd(s string) string {
s = strings.TrimSpace(s)
n := len(s)
if n == 0 {
return ""
}
if n > 0 {
if strings.ContainsAny(s[n-1:], ".!?") {
return s + "\n"
}
}
if n > 1 {
s2 := s[n-2:]
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
return s + "\n"
}
}
return s + ".\n"
}
func fixSpace(s string) string {
return strings.Join(strings.Fields(s), " ")
}
func mkLock(filename string) {
pid := os.Getpid()
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
p(os.Symlink(link, filename))
name, err := os.Readlink(filename)
p(err)
if name != link {
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
}
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Sikkom
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Sikkom/corpus/$ds
cd /net/corpora/nlnieuws/Sikkom/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,7 +56,7 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index

View File

@@ -3,11 +3,11 @@ all: \
metadata \
tzum
xml2txt: cmd/xml2txt/*.go
go build -o $@ $^
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
go build -o $@ $<
metadata: cmd/metadata/*.go
go build -o $@ $^
tzum: cmd/tzum/*.go
go build -o $@ $^
tzum: cmd/tzum/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -3,13 +3,14 @@ package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -46,7 +47,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/Tzum/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -76,7 +77,7 @@ func main() {
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/w%02d", year, week)
if exists(dirname + "/lock") {
continue
}
@@ -111,16 +112,3 @@ func main() {
}
}
func mkLock(filename string) {
pid := os.Getpid()
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
p(os.Symlink(link, filename))
name, err := os.Readlink(filename)
p(err)
if name != link {
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
}
}

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"os"
@@ -21,7 +23,7 @@ type Item struct {
var (
x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
)
func main() {
@@ -30,16 +32,16 @@ func main() {
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
ds = fmt.Sprintf("%d.%02d", year, week)
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy.ww"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/Tzum/" + dp))
x(os.MkdirAll("out", 0777))
@@ -57,10 +59,14 @@ func main() {
var item Item
x(xml.Unmarshal(b, &item))
for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
t := u.FixSpace(cat)
if t == "Nieuws" {
continue
}
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
}
x(fp.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body/p`)
@@ -68,33 +74,9 @@ func main() {
for _, p := range pp {
s := p.Content()
if !strings.Contains(s, "verscheen eerst op Tzum.") {
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
}
}
x(fp.Close())
}
}
func addEnd(s string) string {
s = strings.TrimSpace(s)
n := len(s)
if n == 0 {
return ""
}
if n > 0 {
if strings.ContainsAny(s[n-1:], ".!?") {
return s + "\n"
}
}
if n > 1 {
s2 := s[n-2:]
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
return s + "\n"
}
}
return s + ".\n"
}
func fixSpace(s string) string {
return strings.Join(strings.Fields(s), " ")
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Tzum
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -22,11 +25,13 @@ else
esac
fi
dp=${ds//-//}
year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Tzum/corpus/$ds
cd /net/corpora/nlnieuws/Tzum/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out
mkdir out
../../xml2txt $ds
xml2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

View File

@@ -5,5 +5,5 @@ all: \
metadata: cmd/metadata/*.go
go build -o $@ $^
vrt: cmd/vrt/*.go
go build -o $@ $^
vrt: cmd/vrt/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -4,6 +4,8 @@ import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"bytes"
"encoding/xml"
"fmt"
@@ -11,7 +13,6 @@ import (
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
@@ -78,7 +79,7 @@ func main() {
}()
myLock := "/net/corpora/nlnieuws/VRT/lock"
mkLock(myLock)
u.MkLock(myLock)
defer func() {
_ = os.Remove(myLock)
}()
@@ -109,8 +110,7 @@ func main() {
if t2.After(t) {
t = t2
}
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
if exists(dirname + "/lock") {
continue
}
@@ -179,6 +179,8 @@ func doArticle(filename string, url string, title string, tags []string, cats []
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
/*
s := string(body)
ok := true
@@ -242,18 +244,18 @@ func doArticle(filename string, url string, title string, tags []string, cats []
p(fmt.Fprintln(&buf, "##META text cat ="))
} else {
for _, cat := range cats {
p(fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat)))
p(fmt.Fprintf(&buf, "##META text cat = %s\n", u.FixSpace(cat)))
}
}
if len(tags) == 0 {
p(fmt.Fprintln(&buf, "##META text tag ="))
} else {
for _, tag := range tags {
p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag)))
p(fmt.Fprintf(&buf, "##META text tag = %s\n", u.FixSpace(tag)))
}
}
_, err = buf.WriteString(addEnd(fixSpace(title)))
_, err = buf.WriteString(u.AddEnd(u.FixSpace(title)))
p(err)
fouten := make([]string, 0)
@@ -262,7 +264,7 @@ func doArticle(filename string, url string, title string, tags []string, cats []
pp, err := root.Search(`//div[@data-sentry-component="ArticleHeading"]//*[contains(@class,"prose-article-body-r")]`)
p(err)
for _, p1 := range pp {
p(fmt.Fprint(&buf, addEnd(fixSpace(p1.Content()))))
p(fmt.Fprint(&buf, u.AddEnd(u.FixSpace(p1.Content()))))
found = true
}
if !found {
@@ -277,7 +279,7 @@ func doArticle(filename string, url string, title string, tags []string, cats []
`//div[@data-sentry-component="ArticleTitle"]//h2`)
p(err)
for _, p1 := range pp {
p(fmt.Fprint(&buf, addEnd(fixSpace(p1.Content()))))
p(fmt.Fprint(&buf, u.AddEnd(u.FixSpace(p1.Content()))))
found = true
}
if !found {
@@ -311,40 +313,3 @@ func doArticle(filename string, url string, title string, tags []string, cats []
return true
}
func addEnd(s string) string {
s = strings.TrimSpace(s)
n := len(s)
if n == 0 {
return ""
}
if n > 0 {
if strings.ContainsAny(s[n-1:], ".!?") {
return s + "\n"
}
}
if n > 1 {
s2 := s[n-2:]
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
return s + "\n"
}
}
return s + ".\n"
}
func fixSpace(s string) string {
return strings.Join(strings.Fields(s), " ")
}
func mkLock(filename string) {
pid := os.Getpid()
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
p(os.Symlink(link, filename))
name, err := os.Readlink(filename)
p(err)
if name != link {
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
}
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/VRT
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -2days +%Y-%m-%d`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1
;;
*)
@@ -23,10 +26,11 @@ else
fi
dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/VRT/corpus/$ds
cd /net/corpora/nlnieuws/VRT/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -51,15 +55,15 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out

9
Volkskrant/Makefile Normal file
View File

@@ -0,0 +1,9 @@
# Default target: build both helper programs of the Volkskrant pipeline.
all: \
metadata \
volkskrant
# metadata is rebuilt from all Go files of its command directory ($^ = every
# prerequisite).
metadata: cmd/metadata/*.go
go build -o $@ $^
# The shared util sources are listed as prerequisites but only the first
# prerequisite ($<) is passed to "go build"; presumably they are listed just
# to trigger a rebuild when the shared helpers change.
volkskrant: cmd/volkskrant/*.go ../internal/util/*.go
go build -o $@ $<

View File

@@ -0,0 +1,131 @@
package main
import (
e "codeberg.org/pebbe/errors"
"bufio"
"encoding/xml"
"fmt"
"html"
"os"
"strings"
"time"
)
type Item struct {
XMLName xml.Name `xml:"item"`
UnixTime int64 `xml:"unixTime"`
}
var (
x = e.ExitErr
escape = html.EscapeString
data = make(map[string][]string)
location *time.Location
)
func main() {
var err error
location, err = time.LoadLocation("Europe/Amsterdam")
x(err)
files, err := os.ReadDir(".")
x(err)
for _, file := range files {
filename := file.Name()
if strings.HasSuffix(filename, ".txt") {
doText("", filename)
} else if strings.HasSuffix(filename, ".xml") {
doXml("", filename)
}
}
files, err = os.ReadDir("..")
x(err)
for _, file := range files {
filename := file.Name()
if strings.HasSuffix(filename, ".txt") {
doText("../", filename)
} else if strings.HasSuffix(filename, ".xml") {
doXml("../", filename)
}
}
files, err = os.ReadDir("xml")
x(err)
for _, file := range files {
filename := file.Name()
if !strings.HasSuffix(filename, ".xml") {
continue
}
aa := strings.Split(filename, ".")
base := strings.Join(aa[1:len(aa)-2], ".")
b, err := os.ReadFile("xml/" + filename)
x(err)
s := string(b)
i := strings.Index(s, "<alpino") + 1
i += strings.Index(s[i:], "<")
fp, err := os.Create("xml/" + filename + ".tmp")
x(err)
x(fp.WriteString(s[:i]))
x(fp.WriteString("<metadata>\n <meta type=\"text\" name=\"source\" value=\"Volkskrant\"/>\n"))
for _, m := range data[base] {
x(fp.WriteString(" " + m + "\n"))
}
x(fp.WriteString(" </metadata>\n "))
x(fp.WriteString(stripMeta(s[i:])))
x(fp.Close())
x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
}
}
// doText scans the text file dirname+filename for "##META" lines and stores
// each one that carries a value as a <meta> element under the file's base
// name (the file name minus its last four characters).
func doText(dirname, filename string) {
	key := filename[:len(filename)-4]
	if _, seen := data[key]; !seen {
		data[key] = []string{}
	}

	fp, err := os.Open(dirname + filename)
	x(err)
	defer func() { x(fp.Close()) }()

	lines := bufio.NewScanner(fp)
	for lines.Scan() {
		line := lines.Text()
		if strings.HasPrefix(line, "##META") {
			// Fields: "##META", type, name, "=", value words...
			// Lines without a value have at most four fields and are skipped.
			if fields := strings.Fields(line); len(fields) > 4 {
				typ := fields[1]
				name := escape(fields[2])
				value := escape(strings.Join(fields[4:], " "))
				meta := fmt.Sprintf(`<meta type="%s" name="%s" value="%s"/>`, typ, name, value)
				data[key] = append(data[key], meta)
			}
		}
	}
	x(lines.Err())
}
// doXml reads the item file dirname+filename, converts its unixTime to a
// calendar date in the configured location and stores that date as a
// "pubdate" <meta> element under the file's base name (the file name minus
// its last four characters).
func doXml(dirname, filename string) {
	key := filename[:len(filename)-4]
	if _, seen := data[key]; !seen {
		data[key] = []string{}
	}

	raw, err := os.ReadFile(dirname + filename)
	x(err)

	var parsed Item
	x(xml.Unmarshal(raw, &parsed))

	year, month, day := time.Unix(parsed.UnixTime, 0).In(location).Date()
	meta := fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
		year,
		int(month),
		day)
	data[key] = append(data[key], meta)
}
// stripMeta removes the first <metadata>...</metadata> element from s,
// together with any whitespace that directly follows it.
//
// If s has no opening tag, or the opening tag is never closed, s is returned
// unchanged. Without the second check a missing closing tag would make the
// index arithmetic silently delete just the opening tag.
func stripMeta(s string) string {
	const (
		openTag  = "<metadata>"
		closeTag = "</metadata>"
	)

	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}

	// Look for the closing tag only after the opening tag.
	rel := strings.Index(s[start:], closeTag)
	if rel < 0 {
		return s
	}

	rest := s[start+rel+len(closeTag):]
	return s[:start] + strings.TrimLeft(rest, " \t\r\n")
}

View File

@@ -0,0 +1,390 @@
package main
import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
//"encoding/json"
"encoding/xml"
"fmt"
//"html"
"io"
"net/http"
"net/url"
"os"
"strings"
"time"
)
type Rss struct {
XMLName xml.Name `xml:"rss"`
Items []ItemT `xml:"channel>item"`
}
type ItemT struct {
PubDate string `xml:"pubDate"`
UnixTime int64 `xml:"unixTime"`
Guid string `xml:"guid"`
Link string `xml:"link"`
Title string `xml:"title"`
Data []byte `xml:",innerxml"`
}
/*
type GraphT struct {
Graph []map[string]any `json:"@graph"`
}
*/
var (
p = e.PanicErr
w = e.WarnErr
agent = "AhrefsBot/7.0"
)
// exists reports whether os.Stat succeeds for filename. Any Stat error,
// not only "file does not exist", counts as absent.
func exists(filename string) bool {
	if _, err := os.Stat(filename); err != nil {
		return false
	}
	return true
}
// fileDate returns the text between the first <unixTime> tag in the file and
// the </unixTime> tag that follows it.
//
// It returns "" when the file cannot be read or when either tag is missing.
// The explicit check for the opening tag matters: without it a file that
// lacks <unixTime> but contains </unixTime> would yield an arbitrary slice of
// the file instead of "".
func fileDate(filename string) string {
	const (
		openTag  = "<unixTime>"
		closeTag = "</unixTime>"
	)

	b, err := os.ReadFile(filename)
	if err != nil {
		return ""
	}
	s := string(b)

	start := strings.Index(s, openTag)
	if start < 0 {
		return ""
	}
	start += len(openTag)

	// Search for the closing tag only after the opening tag.
	length := strings.Index(s[start:], closeTag)
	if length < 0 {
		return ""
	}
	return s[start : start+length]
}
// main downloads the Volkskrant RSS feed and, for every item in it, stores
// the raw item as an XML file in a per-day directory and hands the linked
// article to doArticle.
//
// A global lock is taken for the whole run. Day directories that contain
// their own "lock" entry are skipped.
func main() {
	// Turn a panic raised through the error helpers into exit status 1.
	// NOTE(review): e.Panicked is presumably set by e.PanicErr; confirm.
	defer func() {
		if e.Panicked {
			_ = recover()
			os.Exit(1)
		}
	}()

	myLock := "/net/corpora/nlnieuws/Volkskrant/lock"
	u.MkLock(myLock)
	defer func() {
		_ = os.Remove(myLock)
	}()

	req, err := http.NewRequest("GET", "https://www.volkskrant.nl/rss.xml", nil)
	p(err)
	req.Header.Set("User-Agent", agent)
	// Without a timeout a stalled connection would block forever while the
	// global lock is held.
	client := &http.Client{Timeout: time.Minute}
	resp, err := client.Do(req)
	p(err)
	body, err := io.ReadAll(resp.Body)
	p(err)
	p(resp.Body.Close())
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		p(fmt.Errorf("fetching rss feed: unexpected status %s", resp.Status))
	}

	var rss Rss
	p(xml.Unmarshal(body, &rss))

	if len(rss.Items) == 0 {
		p(fmt.Errorf("len(rss.Items) == 0"))
	}

	for _, item := range rss.Items {
		// The publication date is accepted with a numeric or a named zone.
		t, err := time.Parse(time.RFC1123Z, item.PubDate)
		if err != nil {
			t, err = time.Parse(time.RFC1123, item.PubDate)
		}
		p(err)

		dirname := fmt.Sprintf("/net/corpora/nlnieuws/Volkskrant/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
		if exists(dirname + "/lock") {
			continue
		}
		basename := item.Guid
		filename := dirname + "/" + url.PathEscape(basename)

		// The article must be fetched again when the stored item has a
		// different timestamp than the feed (or was not stored at all).
		ts := fmt.Sprintf("%d", t.Unix())
		needUpdate := fileDate(filename+".xml") != ts

		p(os.MkdirAll(dirname, 0777))

		func() {
			var ok bool
			defer func() {
				if e.Panicked {
					fmt.Fprintln(os.Stderr, "----", filename)
					fmt.Fprintln(os.Stderr, "----", item.Link)
				}
				// Drop the item file when the article was not processed, so
				// that the next run sees it as needing an update.
				if !ok {
					_ = os.Remove(filename + ".xml")
				}
			}()

			fp, err := os.Create(filename + ".xml")
			p(err)
			p(fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n"))
			p(fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix()))
			p(fp.Write(item.Data))
			p(fp.WriteString("</item>\n"))
			p(fp.Close())
			p(os.Chtimes(filename+".xml", t, t))

			ok = doArticle(filename, item.Link, item.Title, t, needUpdate)
		}()
	}
}
// doArticle downloads the article at url, extracts its labels, intro and
// body text, and writes them to filename+".txt".
//
// Files written next to filename, all stamped with timestamp:
//   - ".txt":  metadata lines, the title and one line per paragraph
//   - ".skip": the article is deliberately not processed (live blog, or no
//     text found); later calls return immediately
//   - ".err" and ".html": the page did not have the expected structure; the
//     reason and the fetched page are kept for inspection
//
// When needUpdate is true, results of an earlier run are removed first;
// otherwise an existing ".txt" means there is nothing to do.
//
// It returns false only when the page could not be processed (the ".err"
// cases) and true otherwise. I/O and parse errors are passed to p.
func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) {
	if exists(filename + ".skip") {
		return true
	}

	if needUpdate {
		// Removal errors are ignored on purpose: the files need not exist.
		_ = os.Remove(filename + ".err")
		_ = os.Remove(filename + ".html")
		// _ = os.Remove(filename + ".json")
		_ = os.Remove(filename + ".txt")
	} else if exists(filename + ".txt") {
		return true
	}

	// Pause before every download, presumably to avoid hammering the site.
	time.Sleep(2 * time.Second)

	req, err := http.NewRequest("GET", url, nil)
	p(err)
	req.Header.Set("User-Agent", agent)
	// Without a timeout a stalled connection would block the run forever.
	client := &http.Client{Timeout: time.Minute}
	resp, err := client.Do(req)
	p(err)
	body, err := io.ReadAll(resp.Body)
	p(err)
	p(resp.Body.Close())

	body = u.HtmlFix(body)

	doc, err := gokogiri.ParseHtml(body)
	p(err)

	/*
		s := string(body)
		ok = true
		i1 := strings.Index(s, `<script type="application/ld+json"`)
		if i1 < 0 {
			ok = false
		} else {
			i1 += strings.Index(s[i1:], `>`) + 1
			i2 := i1 + strings.Index(s[i1:], `</script>`)
			if i2 < i1 {
				ok = false
			} else {
				s = html.UnescapeString(s[i1:i2])
			}
		}
		if !ok {
			_ = w(fmt.Errorf("script jsonld not found: %s", url))
			fp, err := os.Create(filename + ".err")
			p(err)
			p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
			p(fp.Close())
			p(os.Chtimes(filename+".err", timestamp, timestamp))
			fp, err = os.Create(filename + ".html")
			p(err)
			p(fp.Write(body))
			p(fp.Close())
			p(os.Chtimes(filename+".html", timestamp, timestamp))
			return false
		}
		var graph GraphT
		p(json.Unmarshal([]byte(s), &graph))
		for _, g := range graph.Graph {
			t := g["@type"]
			switch v := t.(type) {
			case string:
				if v == "NewsArticle" {
					b, err := json.Marshal(g)
					p(err)
					s = string(b)
				}
			}
		}
		fp, err := os.Create(filename + ".json")
		p(err)
		p(fp.WriteString(s))
		p(fp.Close())
		p(os.Chtimes(filename+".json", timestamp, timestamp))
	*/

	root := doc.Root()
	articles, err := root.Search(`//article[@id="article-content"]`)
	p(err)
	if len(articles) == 0 {
		saveFailure(filename, "empty", url, body, timestamp)
		return false
	}
	article := articles[0]

	// Live blogs are marked as skipped and never processed.
	live, err := article.Search(`.//*[@data-test-id="live-blog-label"]`)
	p(err)
	if len(live) > 0 {
		saveStamped(filename+".skip", []byte("liveblog\n"), timestamp)
		return true
	}

	headers, err := article.Search(`.//header`)
	p(err)
	if len(headers) == 0 {
		saveFailure(filename, "no header", url, body, timestamp)
		return false
	}
	header := headers[0]

	// Labels become tags, except the generic "Nieuws". Opinion pieces and
	// columns are remembered because they may lack an intro.
	isOpinie := false
	isColumn := false
	var tags []string
	labels, err := header.Search(`.//*[@data-test-id="article-label"]`)
	p(err)
	if len(labels) == 0 {
		_ = w(fmt.Errorf("no labels: %s", url))
	}
	for _, el := range labels {
		label := strings.TrimSpace(el.Content())
		if label != "" && label != "Nieuws" {
			tags = append(tags, label)
		}
		switch strings.ToLower(label) {
		case "opinie":
			isOpinie = true
		case "column":
			isColumn = true
		}
	}

	var pars []string

	// The intro from the header comes first.
	found := false
	intros, err := header.Search(`.//*[@data-test-id="header-intro"]`)
	p(err)
	for _, el := range intros {
		if s := strings.TrimSpace(el.Content()); s != "" {
			pars = append(pars, s)
			found = true
		}
	}
	if !found && !isOpinie && !isColumn {
		_ = w(fmt.Errorf("no intro: %s", url))
	}

	// Drop asides, figures, captions and bold text inside sections before
	// the body text is collected. Nodes are removed last to first.
	specials, err := article.Search(`.//aside | .//figure | .//figcaption | .//section//b`)
	p(err)
	for i := len(specials) - 1; i >= 0; i-- {
		specials[i].Remove()
	}

	elements, err := article.Search(`.//section//*[@data-article-element-index]`)
	p(err)
	if len(elements) == 0 {
		saveFailure(filename, "no elements", url, body, timestamp)
		return false
	}
	found = false
	for _, el := range elements {
		if s := strings.TrimSpace(el.Content()); s != "" {
			pars = append(pars, s)
			found = true
		}
	}
	if !found {
		// Elements exist but hold no text: skip the article from now on and
		// keep the page for inspection.
		_ = w(fmt.Errorf("no text, skipping: %s", url))
		saveStamped(filename+".skip", []byte(url+"\n"), timestamp)
		saveStamped(filename+".html", body, timestamp)
		return true
	}

	fp, err := os.Create(filename + ".txt")
	p(err)
	if len(tags) == 0 {
		p(fmt.Fprintln(fp, "##META text tag ="))
	} else {
		for _, tag := range tags {
			p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
		}
	}
	p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
	for _, par := range pars {
		p(fp.WriteString(u.AddEnd(u.FixSpace(par))))
	}
	p(fp.Close())
	p(os.Chtimes(filename+".txt", timestamp, timestamp))

	return true
}

// saveStamped writes data to the file name, closes it and sets its access
// and modification times to timestamp. Errors are passed to p.
func saveStamped(name string, data []byte, timestamp time.Time) {
	fp, err := os.Create(name)
	p(err)
	p(fp.Write(data))
	p(fp.Close())
	p(os.Chtimes(name, timestamp, timestamp))
}

// saveFailure warns that the page at link could not be processed and stores
// "reason: link" in filename+".err" and the fetched page in filename+".html".
func saveFailure(filename, reason, link string, body []byte, timestamp time.Time) {
	_ = w(fmt.Errorf("%s: %s", reason, link))
	saveStamped(filename+".err", []byte(fmt.Sprintf("%s: %s\n", reason, link)), timestamp)
	saveStamped(filename+".html", body, timestamp)
}

70
Volkskrant/txt2corpus.sh Executable file
View File

@@ -0,0 +1,70 @@
#!/bin/bash
# Build the parsed corpus for one day of Volkskrant articles.
#
# Usage: txt2corpus.sh [yyyy-mm-dd]
# Without an argument the day before yesterday is processed.
# Stop at the first failing command.
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Volkskrant
unset CDPATH
# Helper programs (metadata, tokenize.sh, query.sh, ...) are found via PATH.
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
# Determine the date to process and validate its format.
if [ "$1" = "" ]
then
ds=`date -d -2days +%Y-%m-%d`
else
case "$1" in
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1
;;
*)
echo INVALID
exit 1
;;
esac
fi
# dp: the date as a directory path, yyyy/mm/dd
dp=${ds//-//}
# year: everything before the first dash
year=${ds%%-*}
# corpus: common prefix of all output files for this day
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
cd $PART/$dp
# Take the per-day lock: a symlink named "lock" pointing at lock.<pid>.
# ln fails if the link already exists; the readlink check verifies that the
# link is really ours.
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
then
echo Getting lock failed
exit 1
fi
# Start with a clean scratch directory and a clean sentence file.
rm -fr out
mkdir out
rm -f $corpus.lines
# For every article: drop leading whitespace and ##META lines, tokenize, and
# prefix each resulting line with the key "vk.<basename>.<line number>|".
for i in *.txt
do
b=`basename $i .txt`
perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \
| perl -e '$n = 0; while(<>) { $n++; print("vk.'$b'.$n|$_"); }' \
>> $corpus.lines
done
# Parse all sentences; Alpino is run from out/ after creating out/xml.
cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
# Add the article metadata to the parsed files (run from out/).
metadata
# Pack all parsed files into one corpus file.
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# count per article, not per sentence
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
# Remove the scratch directory and release the lock.
cd ../..
rm -fr out
rm -f lock

View File

@@ -7,6 +7,7 @@ import (
"encoding/json"
"fmt"
"os"
"regexp"
"strconv"
"strings"
"time"
@@ -49,13 +50,14 @@ var (
parts = map[string]struct {
file string
suffix string
re *regexp.Regexp
}{
"nieuwe namen": {"nieuwe-namen", ".t20"},
"nieuwe woorden": {"nieuwe-woorden-extra", ".t20"},
"personen": {"personen", ""},
"andere namen": {"overige-namen", ""},
"locaties": {"locaties", ""},
"organisaties": {"organisaties", ""},
"nieuwe namen": {"nieuwe-namen", ".t20", nil},
"nieuwe woorden": {"nieuwe-woorden-extra", ".t20", nil},
"personen": {"personen", "", nil},
"andere namen": {"overige-namen", "", nil},
"locaties": {"locaties", "", nil},
"organisaties": {"organisaties", "", regexp.MustCompile(`^(ANP|AT5)`)},
}
maanden = strings.Fields("x januari februari maart april mei juni juli augustus september oktober november december")
@@ -70,9 +72,9 @@ var (
func main() {
aa := strings.Split(os.Args[1], "-")
aa := strings.Split(os.Args[1], ".")
if len(aa) != 2 {
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
}
var err error
@@ -128,7 +130,8 @@ func makeParts(source string) *Parts {
func makeValues(source, part string) [][5]any {
v := make([][5]any, 0)
filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%s-%s-%d-%02d-%d%s",
filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%d/%s-%s-%d.%02d-%d%s",
year,
sources[source],
parts[part].file,
year,
@@ -141,12 +144,15 @@ func makeValues(source, part string) [][5]any {
scanner := bufio.NewScanner(fp)
lineno := 0
for scanner.Scan() {
lineno++
line := scanner.Text()
aa := strings.Split(line, "\t")
count, err := strconv.Atoi(strings.TrimSpace(aa[0]))
x(err)
word := aa[1]
if parts[part].re != nil && parts[part].re.MatchString(word) {
continue
}
lineno++
var tags, lemma, postag string
if len(aa) > 2 {
tags = aa[2]
@@ -226,7 +232,7 @@ func dates() (start, first, last string, names []string) {
t3 := tStart
for range size {
y, w := t3.ISOWeek()
names = append(names, fmt.Sprintf("%d/%02d", y, w))
names = append(names, fmt.Sprintf("%d/w%02d", y, w))
t3 = t3.AddDate(0, 0, 7)
}
t3 = tStart

View File

@@ -26,13 +26,24 @@ var (
func main() {
files, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
dirs, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
x(err)
for _, dir := range dirs {
if !dir.IsDir() {
continue
}
dirname := dir.Name()
if dirname[0] != '2' {
continue
}
files, err := os.ReadDir("/net/corpora/nlnieuws/data/json/" + dirname)
x(err)
for _, file := range files {
filename := file.Name()
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
addWeek(filename[5:12])
for _, file := range files {
filename := file.Name()
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
addWeek(filename[5:12])
}
}
}

63
cmd/rang/rang.go Normal file
View File

@@ -0,0 +1,63 @@
package main
// alto 'fp://node[....]' 'tt:%w\t%I' $files | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang
import (
e "codeberg.org/pebbe/errors"
"bufio"
"fmt"
"os"
"sort"
"strings"
)
var (
x = e.ExitErr
)
type Item struct {
word string
count int
}
// main reads tab-separated lines from standard input, counts how often the
// first field of a line occurs, and prints "rank<TAB>word" for every word
// that occurs at least twice.
//
// Words are ordered by descending count, then alphabetically. Words with the
// same count share a rank; the rank goes up by one for each distinct count.
func main() {
	freq := make(map[string]int)

	in := bufio.NewScanner(os.Stdin)
	for in.Scan() {
		word, _, _ := strings.Cut(in.Text(), "\t")
		freq[word]++
	}
	x(in.Err())

	ranked := make([]Item, 0, len(freq))
	for word, n := range freq {
		ranked = append(ranked, Item{word: word, count: n})
	}

	sort.Slice(ranked, func(i, j int) bool {
		left, right := ranked[i], ranked[j]
		if left.count != right.count {
			return left.count > right.count
		}
		return left.word < right.word
	})

	rank, last := 0, 0
	for _, it := range ranked {
		// The list is sorted by count, so nothing useful follows.
		if it.count < 2 {
			break
		}
		if it.count != last {
			rank++
			last = it.count
		}
		fmt.Printf("%d\t%s\n", rank, it.word)
	}
}

View File

@@ -1,114 +0,0 @@
package main
import (
e "codeberg.org/pebbe/errors"
"bufio"
"fmt"
"os"
"regexp"
"sort"
"strconv"
"strings"
)
type Item struct {
text string
lctext string
score int
isnew bool
}
var (
x = e.ExitErr
reYearWeek = regexp.MustCompile(`(.*)([12][0-9][0-9][0-9]-[0-5][0-9])(.*)`)
count = make(map[string]int)
items = make([]Item, 0)
)
func main() {
filename := os.Args[1]
prevname := getPrev(filename)
fp, err := os.Open(prevname)
x(err)
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
aa := strings.SplitN(scanner.Text(), "\t", 2)
n, err := strconv.Atoi(strings.TrimSpace(aa[0]))
x(err)
count[aa[1]] = n
}
x(scanner.Err())
x(fp.Close())
fp, err = os.Open(filename)
x(err)
scanner = bufio.NewScanner(fp)
for scanner.Scan() {
aa := strings.SplitN(scanner.Text(), "\t", 2)
n, err := strconv.Atoi(strings.TrimSpace(aa[0]))
x(err)
n1, ok := count[aa[1]]
items = append(items, Item{
text: aa[1],
lctext: strings.ToLower(aa[1]),
score: n - n1,
isnew: !ok,
})
}
x(scanner.Err())
x(fp.Close())
sort.Slice(items, func(i, j int) bool {
/*
if items[i].isnew && !items[j].isnew {
return true
}
if !items[i].isnew && items[j].isnew {
return false
}
*/
if items[i].score != items[j].score {
return items[i].score > items[j].score
}
return items[i].lctext < items[j].lctext
})
for _, item := range items {
/*
if item.score < 2 {
break
}
*/
p := "."
if item.isnew {
p = "N"
}
fmt.Printf("%s\t%4d\t%s\n", p, item.score, item.text)
}
}
// getPrev returns the name of the file for the week preceding the one encoded
// in filename. The week is taken from the "yyyy-ww" stamp matched by
// reYearWeek; prefix and suffix of the name are kept unchanged.
//
// Stepping back from week 1 lands on week 53 of the previous year. Whenever
// the computed week is 53, the week-53 name is returned only if that file
// exists on disk; otherwise the week-52 name is returned.
//
// A filename without a "yyyy-ww" stamp is reported through x instead of
// causing an index-out-of-range panic.
func getPrev(filename string) string {
	mm := reYearWeek.FindStringSubmatch(filename)
	if mm == nil {
		// FindStringSubmatch returns nil when there is no match.
		x(fmt.Errorf("no yyyy-ww week stamp in file name %q", filename))
		// Not reached if x exits; otherwise the empty name makes the
		// caller's os.Open fail with a clear error.
		return ""
	}

	stamp := mm[2] // always exactly "yyyy-ww" because of the pattern
	year, err := strconv.Atoi(stamp[:4])
	x(err)
	week, err := strconv.Atoi(stamp[5:])
	x(err)

	week--
	if week == 0 {
		week = 53
		year--
	}

	newname := fmt.Sprintf("%s%d-%02d%s", mm[1], year, week, mm[3])
	if week == 53 {
		if _, err := os.Stat(newname); err == nil {
			return newname
		}
		// No week-53 file: fall back to week 52.
		newname = fmt.Sprintf("%s%d-%02d%s", mm[1], year, week-1, mm[3])
	}
	return newname
}

View File

@@ -11,7 +11,7 @@ import (
var (
x = e.ExitErr
reFile = regexp.MustCompile(`(.*)(2[0-9][0-9][0-9]-[0-5][0-9])(.*)`)
reFile = regexp.MustCompile(`(.*)(2[0-9][0-9][0-9]\.[0-5][0-9])(.*)`)
seen = make(map[string]bool)
)
@@ -23,21 +23,30 @@ func main() {
suffix := m[3] + ".t20"
target := infile + ".t20"
x(os.Chdir("/net/corpora/nlnieuws/data"))
files, err := os.ReadDir(".")
dirs, err := os.ReadDir("..")
x(err)
for _, file := range files {
name := file.Name()
if strings.HasPrefix(name, prefix) && strings.HasSuffix(name, suffix) && name < target {
fp, err := os.Open(name)
x(err)
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
seen[strings.Split(scanner.Text(), "\t")[1]] = true
for _, dir := range dirs {
if !dir.IsDir() {
continue
}
dirname := dir.Name()
if dirname[0] != '2' {
continue
}
files, err := os.ReadDir("../" + dirname)
x(err)
for _, file := range files {
name := file.Name()
if strings.HasPrefix(name, prefix) && strings.HasSuffix(name, suffix) && name < target {
fp, err := os.Open("../" + dirname + "/" + name)
x(err)
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
seen[strings.Split(scanner.Text(), "\t")[1]] = true
}
x(scanner.Err())
x(fp.Close())
}
x(scanner.Err())
x(fp.Close())
}
}

108
cmd/trends/trends.go Normal file
View File

@@ -0,0 +1,108 @@
package main
import (
e "codeberg.org/pebbe/errors"
"bufio"
"fmt"
"os"
"sort"
"strconv"
"strings"
)
// Item is one word whose relative rank differs by more than 0.05 between the
// reference list and the current list.
type Item struct {
word string // the word: second tab-separated field of an input line
diff float64 // reference rank fraction minus current rank fraction
gone bool // true when the word is in the reference list but not in the current list
}
var (
// x is the program-wide error handler.
// NOTE(review): presumably it exits the program on a non-nil error — confirm
// against codeberg.org/pebbe/errors.
x = e.ExitErr
)
// main compares two rank lists and prints the words whose relative rank
// changed by more than 0.05.
//
// os.Args[1] names the reference list and os.Args[2] the current list. Every
// line of both files has the form "<rank>\t<word>".
//
// For each list the largest rank plus one is used as the denominator, and as
// the rank of a word that is missing from that list. For every word the
// difference
//
//	refRank/(refMax+1) - curRank/(curMax+1)
//
// is computed. Words with a difference outside [-0.05, 0.05] are printed as
// "<diff>\t<flag>\t<word>", largest difference first, where flag is "X" for
// words that are in the reference list but not in the current list and empty
// otherwise.
//
// Compared to the earlier version, missing arguments, an empty current file
// and lines without a tab are handled instead of causing a panic, close
// errors are checked, and the current maximum is the true maximum instead of
// the rank on the last line (identical for rank-sorted input).
func main() {
	if len(os.Args) < 3 {
		x(fmt.Errorf("usage: %s reference-file current-file", os.Args[0]))
		return // not reached if x exits
	}

	refEntries, refmax := readRanks(os.Args[1])
	refs := make(map[string]int, len(refEntries))
	for _, entry := range refEntries {
		refs[entry.word] = entry.rank // on duplicates the last line wins
	}
	refmax++

	current, curmax := readRanks(os.Args[2])
	curmax++

	items := make([]Item, 0)
	seen := make(map[string]bool, len(current))

	// Words in the current list, compared against their reference rank.
	for _, entry := range current {
		seen[entry.word] = true
		m, ok := refs[entry.word]
		if !ok {
			// Not in the reference list: rank it just below the last one.
			m = refmax
		}
		diff := float64(m)/float64(refmax) - float64(entry.rank)/float64(curmax)
		if diff > 0.05 || diff < -0.05 {
			items = append(items, Item{
				word: entry.word,
				diff: diff,
			})
		}
	}

	// Words that disappeared: their current fraction is curmax/curmax = 1.
	for key, value := range refs {
		if seen[key] {
			continue
		}
		diff := float64(value)/float64(refmax) - 1.0
		if diff > 0.05 || diff < -0.05 {
			items = append(items, Item{
				word: key,
				diff: diff,
				gone: true,
			})
		}
	}

	// Largest difference first; ties are ordered by word so that the output
	// does not depend on map iteration order.
	sort.Slice(items, func(a, b int) bool {
		if items[a].diff == items[b].diff {
			return items[a].word < items[b].word
		}
		return items[a].diff > items[b].diff
	})

	for _, item := range items {
		var s string
		if item.gone {
			s = "X"
		}
		fmt.Printf("%f\t%s\t%s\n", item.diff, s, item.word)
	}
}

// rankEntry is one parsed "<rank>\t<word>" input line.
type rankEntry struct {
	rank int
	word string
}

// readRanks parses the "<rank>\t<word>" lines of filename. It returns the
// entries in file order and the largest rank seen (0 for an empty file).
// Problems are reported through x.
func readRanks(filename string) (entries []rankEntry, maxRank int) {
	fp, err := os.Open(filename)
	x(err)

	scanner := bufio.NewScanner(fp)
	for lineno := 1; scanner.Scan(); lineno++ {
		fields := strings.Split(scanner.Text(), "\t")
		if len(fields) < 2 {
			x(fmt.Errorf("%s:%d: expected rank<TAB>word, got %q", filename, lineno, scanner.Text()))
			continue // not reached if x exits
		}
		rank, err := strconv.Atoi(fields[0])
		x(err)
		entries = append(entries, rankEntry{rank: rank, word: fields[1]})
		maxRank = max(maxRank, rank)
	}
	x(scanner.Err())
	x(fp.Close())
	return entries, maxRank
}

View File

@@ -15,9 +15,9 @@ var (
)
func main() {
aa := strings.Split(os.Args[1], "-")
aa := strings.Split(os.Args[1], ".")
if len(aa) != 2 {
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
}
year, err := strconv.Atoi(aa[0])
@@ -58,7 +58,7 @@ func main() {
fmt.Print(" -or")
}
y, w := t2.ISOWeek()
fmt.Printf(" -name %d-%02d.data.dz", y, w)
fmt.Printf(" -name %d.%02d.data.dz", y, w)
t2 = t2.AddDate(0, 0, 7)
}

View File

@@ -22,10 +22,10 @@ say () {
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -7days +%G.%V`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9].[0-5][0-9])
ds=$1
;;
*)
@@ -35,11 +35,15 @@ else
esac
fi
cd /net/corpora/nlnieuws/data
year=${ds%%.*}
mkdir -p /net/corpora/nlnieuws/data/$year
mkdir -p /net/corpora/nlnieuws/data/json/$year
cd /net/corpora/nlnieuws/data/$year
declare -A parts
#parts[alles]='.'
parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso'
parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso|Volkskrant'
parts[amsterdam]='AT5|BuurtAdam|Parool'
parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom'
parts[literatuur]='LitNL|Tzum'
@@ -60,6 +64,7 @@ parts[vlaanderen]='HLN|VRT'
#parts[Sargasso]='Sargasso'
#parts[Sikkom]='Sikkom'
#parts[Tzum]='Tzum'
#parts[Volkskrant]='Volkskrant'
#parts[VRT]='VRT'
for part in ${!parts[@]}
@@ -68,7 +73,7 @@ do
for i in 1 4
do
files=$(find .. $(week2files $ds $i) | grep -E "$regex") || true
files=$(find ../.. $(week2files $ds $i) | grep -E "$regex") || true
if [ -z "$files" ]
then
continue
@@ -158,10 +163,23 @@ do
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
> $part-nieuwe-adjww-extra-$ds-$i
# ranglijsten
say $part-rang-$ds-$i
alto \
'fp://node[((@pt="n" or @neclass) and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass])]' \
'tt:%w\t%I' $files \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
> $part-rang-$ds-$i
done
done
data2json $ds 1 > json/DATA-$ds-1.json
data2json $ds 4 > json/DATA-$ds-4.json
dates2json > json/index1.json
dates2json > json/index4.json
data2json $ds 1 > ../json/$year/DATA-$ds-1.json
data2json $ds 4 > ../json/$year/DATA-$ds-4.json
dates2json > ../json/index1.json
dates2json > ../json/index4.json
# rechten bijwerken
chmod -R g+w /net/corpora/nlnieuws
chgrp -R software /net/corpora/nlnieuws

4
go.mod
View File

@@ -1,11 +1,13 @@
module nlnieuws
module git.web.rug.nl/p209327/nlnieuws
go 1.26.1
require (
codeberg.org/pebbe/errors v0.4.0
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5
github.com/pebbe/compactcorpus v1.0.3
github.com/pebbe/textcat/v2 v2.3.0
github.com/rug-compling/alpinods v1.18.1
)
require github.com/pebbe/util v0.9.0 // indirect

4
go.sum
View File

@@ -2,7 +2,11 @@ codeberg.org/pebbe/errors v0.4.0 h1:G05wsXpC/LRPaL02QYDwtz0sWFWQcIWK1s+MC79LBzU=
codeberg.org/pebbe/errors v0.4.0/go.mod h1:O7PPxUJM1bWRHq11CRK3wqVaH/3NnRaSVZvh3UhzDCY=
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 h1:tQbR4RKFBFi0+Ll69dXejKKUbQVNaOAT2fjlDvSAfx4=
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5/go.mod h1:kQE2lxPgVKe0JsBZMFFfMm5zBDCuRhaHFKOBzZeCLiw=
github.com/pebbe/compactcorpus v1.0.3 h1:6qlfXKHTKg7oWKLPCgEgv1scplfvphg/9l9XiRT2HzQ=
github.com/pebbe/compactcorpus v1.0.3/go.mod h1:SSpTeCZataCjjs82RJb8SOGdjkB3PlR7Z19EY4rInoQ=
github.com/pebbe/textcat/v2 v2.3.0 h1:RB2egIQgI2a2Ls+I9No6KFQKCZBIFt8Cc/SWCnVtC7Y=
github.com/pebbe/textcat/v2 v2.3.0/go.mod h1:WLXWuL+fOlQJqn6LmubjD+e78hCC6Y/rAWInh0wq/kg=
github.com/pebbe/util v0.9.0 h1:PMZd+CpWb8GbWEmFGlL3qd6XPuywl6xFIbrXWi870OA=
github.com/pebbe/util v0.9.0/go.mod h1:ynWl/SFX4+Seb9fpjVlYevr1f4TP7FrCmyZHiBCg69Q=
github.com/rug-compling/alpinods v1.18.1 h1:BvPcCnNEQ1QoVSc0RmwJd3kZmvo4iqZ52/vFzVvFS7w=
github.com/rug-compling/alpinods v1.18.1/go.mod h1:R3BBX8RIw9InVqHZ+1W+MsX8WX8uBkoVNNGE38mqF1Q=

74
internal/util/util.go Normal file
View File

@@ -0,0 +1,74 @@
package util
import (
e "codeberg.org/pebbe/errors"
"fmt"
"os"
"path/filepath"
"regexp"
"strings"
)
var (
// p is the package-wide error handler.
// NOTE(review): presumably it panics on a non-nil error — confirm against
// codeberg.org/pebbe/errors.
p = e.PanicErr
// reEOL matches text that ends in '.', '!' or '?', optionally followed by
// one closing quote character.
reEOL = regexp.MustCompile(`[.!?]['"”’]?$`)
// reNEOL matches sentence-final punctuation (optionally followed by a quote
// character) that is glued to a capitalised word: one uppercase letter and
// one or more lowercase letters, optionally followed by a period.
reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}+\.?`)
// reLET matches a single uppercase letter.
reLET = regexp.MustCompile(`\p{Lu}`)
// reBody matches a <br> tag in any letter case, with optional spaces and
// slashes before the closing '>'.
reBody = regexp.MustCompile(`<[bB][rR][ /]*>`)
// reQuotLeft matches an opening <em> or <i> tag.
reQuotLeft = regexp.MustCompile(`<em>|<i>`)
// reQuotRight matches a closing </em> or </i> tag.
reQuotRight = regexp.MustCompile(`</em>|</i>`)
)
// HtmlFix prepares raw HTML for text extraction. Opening <em>/<i> tags become
// a space plus a low opening quote, closing </em>/</i> tags become a closing
// quote plus a space, and <br> tags become a single space. The result is
// returned as a new byte slice.
func HtmlFix(html []byte) []byte {
	opened := reQuotLeft.ReplaceAllLiteral(html, []byte(" „"))
	quoted := reQuotRight.ReplaceAllLiteral(opened, []byte("” "))
	return reBody.ReplaceAllLiteral(quoted, []byte(" "))
}
// HtmlFixString is the string counterpart of HtmlFix. Opening <em>/<i> tags
// become a space plus a low opening quote, closing </em>/</i> tags become a
// closing quote plus a space, and <br> tags become a single space.
func HtmlFixString(html string) string {
	opened := reQuotLeft.ReplaceAllLiteralString(html, " „")
	quoted := reQuotRight.ReplaceAllLiteralString(opened, "” ")
	return reBody.ReplaceAllLiteralString(quoted, " ")
}
// AddEnd turns s into a terminated line. Surrounding white space is removed;
// an empty result stays empty. Otherwise a newline is appended, preceded by a
// period when the text does not already end in '.', '!' or '?' (optionally
// followed by a closing quote).
func AddEnd(s string) string {
	trimmed := strings.TrimSpace(s)
	switch {
	case trimmed == "":
		return ""
	case reEOL.MatchString(trimmed):
		return trimmed + "\n"
	default:
		return trimmed + ".\n"
	}
}
// FixSpace collapses every run of white space in s into a single space and
// removes leading and trailing white space.
//
// When the first optional argument is true, it also separates sentences that
// were glued together: wherever sentence-final punctuation is directly
// followed by a capitalised word, a space is inserted before the first
// uppercase letter. Matches that end in a period are left alone, because
// those look like abbreviations.
func FixSpace(s string, opt ...bool) string {
	collapsed := strings.Join(strings.Fields(s), " ")
	if len(opt) == 0 || !opt[0] {
		return collapsed
	}
	return reNEOL.ReplaceAllStringFunc(collapsed, func(match string) string {
		if strings.HasSuffix(match, ".") {
			// e.g. "v.Chr."
			return match
		}
		// reNEOL guarantees an uppercase letter in every match.
		at := reLET.FindStringIndex(match)[0]
		return match[:at] + " " + match[at:]
	})
}
// MkLock creates a lock in the form of a symbolic link named filename whose
// target is "<base name of filename>.<pid of this process>". After creating
// the link it reads it back and verifies the target, so that a link owned by
// another process is detected. Every failure is passed to p.
func MkLock(filename string) {
	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
	p(os.Symlink(want, filename))

	got, err := os.Readlink(filename)
	p(err)
	if got != want {
		p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
	}
}

66
oud/fix.go Normal file
View File

@@ -0,0 +1,66 @@
package main
/*
Dit past corpora aan
Tags verwijderen:
Oog: Nieuws
Parool: Nieuws
RO: Artikelen, cafeyn
RTVNoord: br_*
Tzum: Nieuws
Tags veranderen:
RTVNoord: tr_* → *
*/
import (
e "codeberg.org/pebbe/errors"
cc "github.com/pebbe/compactcorpus"
"github.com/rug-compling/alpinods"
"encoding/xml"
"fmt"
"os"
"strings"
)
var (
// x is the program-wide error handler.
// NOTE(review): presumably it exits the program on a non-nil error — confirm
// against codeberg.org/pebbe/errors.
x = e.ExitErr
)
// main rewrites the "tag" metadata of every corpus named on the command line.
// For each input file it writes a new corpus whose name is the input name
// with the ".data.dz" suffix replaced by "-new.data.dz". In the new corpus
// the tags "Nieuws", "Artikelen" and "cafeyn" and all tags starting with
// "br_" are removed, and the "tr_" prefix is stripped from tags that have it.
func main() {
for _, file := range os.Args[1:] {
base := strings.TrimSuffix(file, ".data.dz")
newfile := base + "-new.data.dz"
incc, err := cc.Open(file)
x(err)
outcc, err := cc.NewCorpus(newfile)
x(err)
r, err := incc.NewRange()
x(err)
for r.HasNext() {
name, data := r.Next()
// Progress indicator: "\r" moves back to the start of the line so that the
// next entry overwrites this one.
fmt.Printf("%s %s \r", base, name)
var alpino alpinods.AlpinoDS
x(xml.Unmarshal(data, &alpino))
for i := 0; i < len(alpino.Metadata.Meta); i++ {
// Only metadata items named "tag" are touched.
if alpino.Metadata.Meta[i].Name != "tag" {
continue
}
if n := alpino.Metadata.Meta[i].Value; n == "Nieuws" || n == "Artikelen" || n == "cafeyn" || strings.HasPrefix(n, "br_") {
// Delete element i in place, then step i back so that the element that
// moved into slot i is examined in the next iteration.
alpino.Metadata.Meta = append(alpino.Metadata.Meta[:i], alpino.Metadata.Meta[i+1:]...)
i--
} else if strings.HasPrefix(n, "tr_") {
// Drop the three-byte "tr_" prefix.
alpino.Metadata.Meta[i].Value = n[3:]
}
}
// NOTE(review): whatever Write returns is discarded here — confirm whether
// it reports an error that should go through x.
outcc.Write(name, []byte(alpino.String()))
}
// NOTE(review): incc is not closed anywhere in this loop — confirm whether
// compactcorpus requires an explicit close.
x(outcc.Close())
}
}

View File

@@ -15,7 +15,7 @@ gebruik:
1 : nieuwe namen
2 : nieuwe woorden
3 : nieuwe woorden met postag en lemma
4 : bestaaande locaties
4 : bestaande locaties
5 : bestaande personen
6 : bestaande organisaties
7 : bestaande andere namen

View File

@@ -1,232 +0,0 @@
Vragen:
- hoe data range selecteren (bv alles van maart 2026)
- website met lijstjes top-N (20?)
- nieuwe namen
- wel of niet onderverdelen naar categorie?
- nieuwe woorden
- met postag
- bestaande namen
- personen
- plaatsen
- organisaties
- misc
- queries worden nog beetje aangepast denk ik
"nieuw": nu: niet in Alpino, later (ook): niet in top-N van vorige maand.
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' tt:%w |sort | uniq -c |sort -nr | head -n 20
"nieuwe namen"
445 Straat van Hormuz
433 Jetten
309 AI
301 Høiby
250 Odido
190 Zelensky
174 Rob Jetten
153 VRT NWS
134 Jeffrey Epstein
130 Anthropic
125 Schulting
115 GroenLinks-PvdA
109 TikTok
106 Xandra Velzeboer
106 Kyiv
106 JA21
104 Starmer
98 Marius Borg Høiby
95 Revolutionaire Garde
94 Jens van 't Wout
"nieuwe woorden":
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' tt:%w |sort | uniq -c |sort -nr |head -n 20
150 Trump-regering
141 coalitieakkoord
126 zeestraat
122 Golfregio
107 massastart
96 Amerikaans-Israëlische
92 ballistische
90 datalek
85 kabinet-Jetten
82 lng
74 droneaanval
68 vergeldingsaanvallen
61 tussenronde
59 Iranoorlog
58 vrijgave
56 speelzand
55 regering-Trump
54 sprintrace
54 ploegenachtervolging
liever met postag en lemma erbij:
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' tt:"%w \t %l \t %P" |sort | uniq -c |sort -nr |head -n 20
150 Trump-regering Trump_regering N(soort,ev,basis,zijd,stan)
141 coalitieakkoord coalitie_akkoord N(soort,ev,basis,onz,stan)
126 zeestraat zee_straat N(soort,ev,basis,zijd,stan)
121 Golfregio Golf_regio N(soort,ev,basis,zijd,stan)
107 massastart massa_start N(soort,ev,basis,zijd,stan)
96 Amerikaans-Israëlische Amerikaans_Israëlisch ADJ(prenom,basis,met-e,stan)
90 datalek data_lek N(soort,ev,basis,onz,stan)
90 ballistische ballistisch ADJ(prenom,basis,met-e,stan)
82 lng lng N(soort,ev,basis,onz,stan)
74 droneaanval drone_aanval N(soort,ev,basis,zijd,stan)
72 kabinet-Jetten kabinet-Jetten N(soort,ev,basis,onz,stan)
66 vergeldingsaanvallen vergelding_aanval N(soort,mv,basis)
61 tussenronde tussen_ronde N(soort,ev,basis,zijd,stan)
59 Iranoorlog Iran_oorlog N(soort,ev,basis,zijd,stan)
56 speelzand speel_zand N(soort,ev,basis,onz,stan)
55 regering-Trump regering_Trump N(soort,ev,basis,zijd,stan)
54 vrijgave vrij_gave N(soort,ev,basis,zijd,stan)
54 sprintrace sprint_race N(soort,ev,basis,zijd,stan)
54 ploegenachtervolging ploeg_achtervolging N(soort,ev,basis,zijd,stan)
53 staatsmedia staat_medium N(soort,mv,basis)
"bestaande locaties":
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
3910 Iran
2180 Nederland
1929 VS
1610 Israël
1218 Midden-Oosten
1128 Oekraïne
942 Verenigde Staten
874 Rusland
823 Amsterdam
776 Europa
668 DEN HAAG
563 België
555 China
445 Milaan
429 Frankrijk
389 Duitsland
380 Brussel
374 Dubai
368 Libanon
364 Groningen
"bestaande personen":
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
1812 Trump
531 Donald Trump
327 Khamenei
309 Epstein
267 Verstappen
229 Andrew
208 Máxima
187 Ali Khamenei
161 Orbán
146 Trumps
133 Mette-Marit
133 Keijzer
126 Willem-Alexander
126 Kok
122 Charles
118 Stolz
113 Harald
111 Poetin
97 Van Persie
94 Wilders
"bestaande organisaties":
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
2575 ANP
547 Ajax
449 Instagram
421 EU
357 Defensie
349 Feyenoord
348 D66
346 VVD
329 PSV
305 Hezbollah
303 Tweede Kamer
303 NEC
296 AZ
265 CDA
263 OM
237 NU.nl
232 NOS
231 BBC
224 Kamer
219 Openbaar Ministerie
"bestaande andere namen (boeken, films, events, .. )":
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
361 Spelen
289 Olympische Spelen
278 Eredivisie
244 X
222 Winterspelen
177 Champions League
147 Formule 1
143 Premier League
137 X.
112 Oscars
102 Grand Prix
100 Paralympische Spelen
90 Facebook
78 Eurovisie Songfestival
76 WhatsApp
75 Parijs-Nice
70 Tweede Wereldoorlog
67 Oscar
66 The New York Times
62 AEX-index
/* deze misschien niet? */
"nieuwe adjectieven, deelwoorden en werkwoorden":
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' tt:"%w %P" |sort | uniq -c |sort -nr |head -n 20
96 Amerikaans-Israëlische ADJ(prenom,basis,met-e,stan)
90 ballistische ADJ(prenom,basis,met-e,stan)
41 radicaal-rechtse ADJ(prenom,basis,met-e,stan)
29 Israëlisch-Amerikaanse ADJ(prenom,basis,met-e,stan)
27 pro-Iraanse ADJ(prenom,basis,met-e,stan)
25 Belarussische ADJ(prenom,basis,met-e,stan)
22 radicaal-linkse ADJ(prenom,basis,met-e,stan)
21 Omaanse ADJ(prenom,basis,met-e,stan)
19 pro-Palestijnse ADJ(prenom,basis,met-e,stan)
16 partijloze ADJ(prenom,basis,met-e,stan)
15 Eindhovense ADJ(prenom,basis,met-e,stan)
14 cybercriminele ADJ(prenom,basis,met-e,stan)
14 bestverkochte WW(vd,prenom,met-e)
12 onbevestigde WW(vd,prenom,met-e)
12 kindgebonden WW(vd,prenom,zonder)
12 AI-gegenereerde WW(vd,prenom,met-e)
11 toekomstbestendig ADJ(vrij,basis,zonder)
11 omhooggegaan WW(vd,vrij,zonder)
11 Iraans-Koerdische ADJ(prenom,basis,met-e,stan)
11 antifascistische ADJ(prenom,basis,met-e,stan)

View File

@@ -74,7 +74,8 @@
<div class="option" id="week">
week:
<input type="date" id="fDate" name="date" step="7" />
<input type="date" id="fDate" name="date" step="7" /><span
class="validity"></span>
</div>
<button type="button" onclick="kies()" id="fSubmit" disabled>
@@ -109,11 +110,16 @@
<td class="bar"><div id="RO"></div></td>
<td><a href="https://reportersonline.nl/">Reporters Online</a></td>
</tr>
<tr class="last">
<tr>
<td></td>
<td class="bar"><div id="Sargasso"></div></td>
<td><a href="https://sargasso.nl/">Sargasso</a></td>
</tr>
<tr class="last">
<td></td>
<td class="bar"><div id="Volkskrant"></div></td>
<td><a href="https://www.volkskrant.nl/">de Volkskrant</a></td>
</tr>
<tr class="first">
<td>Amsterdam</td>
<td class="bar"><div id="AT5"></div></td>
@@ -121,16 +127,20 @@
</tr>
<tr>
<td></td>
<td class="bar"><div id="BuurtAdam"></div></td>
<td class="bar"><div id="Parool"></div></td>
<td>
<a href="https://indebuurt.nl/amsterdam/">In de buurt Amsterdam</a>
<a href="https://www.parool.nl/amsterdam/"
>Het Parool | Amsterdam</a
>
</td>
</tr>
<tr class="last">
<td></td>
<td class="bar"><div id="Parool"></div></td>
<td class="bar"><div id="BuurtAdam"></div></td>
<td>
<a href="https://www.parool.nl/amsterdam/">Parool Amsterdam</a>
<a href="https://indebuurt.nl/amsterdam/"
>In de buurt | Amsterdam</a
>
</td>
</tr>
<tr class="first">
@@ -146,7 +156,9 @@
<td></td>
<td class="bar"><div id="BuurtGrn"></div></td>
<td>
<a href="https://indebuurt.nl/groningen/">In de buurt Groningen</a>
<a href="https://indebuurt.nl/groningen/"
>In de buurt | Groningen</a
>
</td>
</tr>
<tr>

View File

@@ -133,7 +133,9 @@ function makeTD(title, values) {
async function loadSource(source, week) {
if (!data.has(week)) {
data[week] = await getJSON('DATA-' + week + '-4.json')
data[week] = await getJSON(
week.substring(0, 4) + '/DATA-' + week + '-4.json',
)
}
idSubtitle.innerHTML = source + ' — t/m ' + data[week].last
@@ -154,7 +156,9 @@ async function loadSource(source, week) {
async function loadPart(part, week) {
if (!data.has(week)) {
data[week] = await getJSON('DATA-' + week + '-4.json')
data[week] = await getJSON(
week.substring(0, 4) + '/DATA-' + week + '-4.json',
)
}
idSubtitle.innerHTML = part + ' — t/m ' + data[week].last
@@ -185,7 +189,9 @@ async function loadWeken(source, part) {
if (i < dates.length) {
var week = dates[i].week
if (!data.has(week)) {
data[week] = await getJSON('DATA-' + week + '-4.json')
data[week] = await getJSON(
week.substring(0, 4) + '/DATA-' + week + '-4.json',
)
}
var values = data[week][source][part]
tr.appendChild(makeTD('t/m ' + data[week].last, values))

View File

@@ -167,3 +167,11 @@ tr.last td {
tr.first td {
padding-top: 0.5em;
}
/* Validity marker for the #fDate date input, rendered in the span that
   directly follows it: a cross while the input is invalid ... */
#fDate:invalid + span::after {
content: ' ✖';
}
/* ... and a check mark while the input is valid. */
#fDate:valid + span::after {
content: ' ✓';
}