From a76fa215848486ef3198c58d9c06d8e23fcd6f87 Mon Sep 17 00:00:00 2001 From: Peter Kleiweg Date: Fri, 5 Jun 2026 16:05:46 +0200 Subject: [PATCH] update --- .gitignore | 2 + Makefile | 1 + Parool/cmd/parool/parool.go | 8 +- README.md | 2 + Tzum/cmd/xml2txt/xml2txt.go | 23 +- Volkskrant/Makefile | 9 + Volkskrant/cmd/metadata/metadata.go | 131 ++++++++ Volkskrant/cmd/volkskrant/volkskrant.go | 387 ++++++++++++++++++++++++ Volkskrant/txt2corpus.sh | 70 +++++ collect.sh | 3 +- internal/util/util.go | 24 +- www/app.html | 23 +- 12 files changed, 645 insertions(+), 38 deletions(-) create mode 100644 Volkskrant/Makefile create mode 100644 Volkskrant/cmd/metadata/metadata.go create mode 100644 Volkskrant/cmd/volkskrant/volkskrant.go create mode 100755 Volkskrant/txt2corpus.sh diff --git a/.gitignore b/.gitignore index 795f7f6..713ab78 100644 --- a/.gitignore +++ b/.gitignore @@ -38,6 +38,8 @@ Sikkom/sikkom Tzum/metadata Tzum/tzum Tzum/xml2txt +Volkskrant/metadata +Volkskrant/volkskrant VRT/metadata VRT/vrt bin/data2json diff --git a/Makefile b/Makefile index 3fd7538..0513cd9 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,7 @@ all: make -C Sargasso make -C Sikkom make -C Tzum + make -C Volkskrant make -C VRT make bin/data2json make bin/dates2json diff --git a/Parool/cmd/parool/parool.go b/Parool/cmd/parool/parool.go index 53e5540..b446953 100644 --- a/Parool/cmd/parool/parool.go +++ b/Parool/cmd/parool/parool.go @@ -280,6 +280,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n } header := headers[0] + isVideo := false tags := make([]string, 0) ell, err := header.Search(`.//*[@data-test-id="article-label"]`) p(err) @@ -291,6 +292,9 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n if s != "" && s != "Nieuws" { tags = append(tags, s) } + if strings.ToLower(s) == "video" { + isVideo = true + } } pars := make([]string, 0) @@ -344,7 +348,9 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n } } if !found { - _ = w(fmt.Errorf("no text, skipping: %s", url)) + if !isVideo { + _ = w(fmt.Errorf("no text, skipping: %s", url)) + } fp, err := os.Create(filename + ".skip") p(fp.WriteString(url + "\n")) p(err) diff --git a/README.md b/README.md index 1db09c5..fa95f72 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ crontab van p209327@colossus 17 * * * * /net/corpora/nlnieuws/Sikkom/sikkom 18 * * * * /net/corpora/nlnieuws/Tzum/tzum 19 * * * * /net/corpora/nlnieuws/VRT/vrt +20 * * * * /net/corpora/nlnieuws/Volkskrant/volkskrant ``` ## 2. Teksten verwerken: omzetten naar zinnen, parsen, metadata toevoegen @@ -53,6 +54,7 @@ crontab van p209327@colossus 0 1 * * * /net/corpora/nlnieuws/NU/txt2corpus.sh 0 1 * * * /net/corpora/nlnieuws/NieuwsNL/txt2corpus.sh 0 1 * * * /net/corpora/nlnieuws/VRT/txt2corpus.sh +0 1 * * * /net/corpora/nlnieuws/Volkskrant/txt2corpus.sh # weinig data: alleen op dinsdag 0 1 * * 2 /net/corpora/nlnieuws/AT5/txt2corpus.sh 0 1 * * 2 /net/corpora/nlnieuws/BuurtAdam/txt2corpus.sh diff --git a/Tzum/cmd/xml2txt/xml2txt.go b/Tzum/cmd/xml2txt/xml2txt.go index 3d8e7c6..d8e8b47 100644 --- a/Tzum/cmd/xml2txt/xml2txt.go +++ b/Tzum/cmd/xml2txt/xml2txt.go @@ -24,8 +24,6 @@ var ( x = e.ExitErr reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`) - reEM = regexp.MustCompile(`::EM::.*?::/EM::`) - reTitle = regexp.MustCompile(`^\p{Lu}`) ) func main() { @@ -68,7 +66,7 @@ func main() { x(fmt.Fprintf(fp, "##META text tag = %s\n", t)) } x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) - doc, err := gokogiri.ParseHtml([]byte(`` + em1(u.HtmlFixString(item.Text)) + ``)) + doc, err := gokogiri.ParseHtml([]byte(`` + u.HtmlFixString(item.Text) + ``)) x(err) root := doc.Root() pp, err := root.Search(`//body/p`) @@ -76,26 +74,9 @@ func main() { for _, p := range pp { s := p.Content() if !strings.Contains(s, "verscheen eerst op Tzum.") { - x(fp.WriteString(em2(u.AddEnd(u.FixSpace(p.Content()))))) + x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content())))) } } x(fp.Close()) } } - -func em1(s string) string { - return strings.ReplaceAll( - strings.ReplaceAll(s, "", " ::EM::"), - "", - "::/EM:: ") -} - -func em2(s string) string { - return reEM.ReplaceAllStringFunc(s, func(s1 string) string { - s1 = s1[6 : len(s1)-7] - if reTitle.MatchString(s1) { - return `"` + s1 + `"` - } - return s1 - }) -} diff --git a/Volkskrant/Makefile b/Volkskrant/Makefile new file mode 100644 index 0000000..841a156 --- /dev/null +++ b/Volkskrant/Makefile @@ -0,0 +1,9 @@ +all: \ + metadata \ + volkskrant + +metadata: cmd/metadata/*.go + go build -o $@ $^ + +volkskrant: cmd/volkskrant/*.go ../internal/util/*.go + go build -o $@ $< diff --git a/Volkskrant/cmd/metadata/metadata.go b/Volkskrant/cmd/metadata/metadata.go new file mode 100644 index 0000000..49f82c7 --- /dev/null +++ b/Volkskrant/cmd/metadata/metadata.go @@ -0,0 +1,131 @@ +package main + +import ( + e "codeberg.org/pebbe/errors" + + "bufio" + "encoding/xml" + "fmt" + "html" + "os" + "strings" + "time" +) + +type Item struct { + XMLName xml.Name `xml:"item"` + UnixTime int64 `xml:"unixTime"` +} + +var ( + x = e.ExitErr + escape = html.EscapeString + data = make(map[string][]string) + location *time.Location +) + +func main() { + var err error + location, err = time.LoadLocation("Europe/Amsterdam") + x(err) + + files, err := os.ReadDir(".") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText("", filename) + } else if strings.HasSuffix(filename, ".xml") { + doXml("", filename) + } + } + files, err = os.ReadDir("..") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText("../", filename) + } else if strings.HasSuffix(filename, ".xml") { + doXml("../", filename) + } + } + + files, err = os.ReadDir("xml") + x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".xml") { + continue + } + aa := strings.Split(filename, ".") + base := strings.Join(aa[1:len(aa)-2], ".") + b, err := os.ReadFile("xml/" + filename) + x(err) + s := string(b) + i := strings.Index(s, "\n \n")) + for _, m := range data[base] { + x(fp.WriteString(" " + m + "\n")) + } + x(fp.WriteString(" \n ")) + x(fp.WriteString(stripMeta(s[i:]))) + x(fp.Close()) + x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) + } +} + +func doText(dirname, filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + fp, err := os.Open(dirname + filename) + x(err) + defer func() { x(fp.Close()) }() + scanner := bufio.NewScanner(fp) + for scanner.Scan() { + line := scanner.Text() + if !strings.HasPrefix(line, "##META") { + continue + } + aa := strings.Fields(line) + if len(aa) > 4 { + data[base] = append(data[base], + fmt.Sprintf(``, + aa[1], + escape(aa[2]), + escape(strings.Join(aa[4:], " ")))) + } + } + x(scanner.Err()) +} + +func doXml(dirname, filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + b, err := os.ReadFile(dirname + filename) + x(err) + var item Item + x(xml.Unmarshal(b, &item)) + t := time.Unix(item.UnixTime, 0).In(location) + data[base] = append(data[base], + fmt.Sprintf(``, + t.Year(), + int(t.Month()), + t.Day())) +} + +func stripMeta(s string) string { + i1 := strings.Index(s, "") + if i1 < 0 { + return s + } + i2 := i1 + strings.Index(s[i1:], "") + 11 + return s[:i1] + strings.TrimLeft(s[i2:], " \t\r\n") +} diff --git a/Volkskrant/cmd/volkskrant/volkskrant.go b/Volkskrant/cmd/volkskrant/volkskrant.go new file mode 100644 index 0000000..121b69d --- /dev/null +++ b/Volkskrant/cmd/volkskrant/volkskrant.go @@ -0,0 +1,387 @@ +package main + +import ( + e "codeberg.org/pebbe/errors" + "github.com/jbowtie/gokogiri" + + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + + //"encoding/json" + "encoding/xml" + "fmt" + //"html" + "io" + "net/http" + "net/url" + "os" + "strings" + "time" +) + +type Rss struct { + XMLName xml.Name `xml:"rss"` + Items []ItemT `xml:"channel>item"` +} + +type ItemT struct { + PubDate string `xml:"pubDate"` + UnixTime int64 `xml:"unixTime"` + Guid string `xml:"guid"` + Link string `xml:"link"` + Title string `xml:"title"` + Data []byte `xml:",innerxml"` +} + +/* +type GraphT struct { + Graph []map[string]any `json:"@graph"` +} +*/ + +var ( + p = e.PanicErr + w = e.WarnErr + agent = "AhrefsBot/7.0" +) + +func exists(filename string) bool { + _, err := os.Stat(filename) + return err == nil +} + +func fileDate(filename string) string { + b, err := os.ReadFile(filename) + if err != nil { + return "" + } + s := string(b) + i1 := strings.Index(s, "") + 10 + i2 := strings.Index(s, "") + if i2 < i1 { + return "" + } + return s[i1:i2] +} + +func main() { + defer func() { + if e.Panicked { + _ = recover() + os.Exit(1) + } + }() + + myLock := "/net/corpora/nlnieuws/Volkskrant/lock" + u.MkLock(myLock) + defer func() { + _ = os.Remove(myLock) + }() + + req, err := http.NewRequest("GET", "https://www.volkskrant.nl/rss.xml", nil) + p(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + p(err) + body, err := io.ReadAll(resp.Body) + p(err) + p(resp.Body.Close()) + + var rss Rss + p(xml.Unmarshal(body, &rss)) + + if len(rss.Items) == 0 { + p(fmt.Errorf("len(rss.Items) == 0")) + } + + for _, item := range rss.Items { + t, err := time.Parse(time.RFC1123Z, item.PubDate) + if err != nil { + t, err = time.Parse(time.RFC1123, item.PubDate) + } + p(err) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/Volkskrant/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day()) + if exists(dirname + "/lock") { + continue + } + basename := item.Guid + filename := dirname + "/" + url.PathEscape(basename) + + ts := fmt.Sprintf("%d", t.Unix()) + needUpdate := fileDate(filename+".xml") != ts + + p(os.MkdirAll(dirname, 0777)) + func() { + var ok bool + defer func() { + if e.Panicked { + fmt.Fprintln(os.Stderr, "----", filename) + fmt.Fprintln(os.Stderr, "----", item.Link) + } + if !ok { + _ = os.Remove(filename + ".xml") + } + }() + fp, err := os.Create(filename + ".xml") + p(err) + p(fp.WriteString("\n\n")) + p(fmt.Fprintf(fp, "%d", t.Unix())) + p(fp.Write(item.Data)) + p(fp.WriteString("\n")) + p(fp.Close()) + p(os.Chtimes(filename+".xml", t, t)) + ok = doArticle(filename, item.Link, item.Title, t, needUpdate) + }() + } +} + +func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) { + if exists(filename + ".skip") { + return true + } + if needUpdate { + _ = os.Remove(filename + ".err") + _ = os.Remove(filename + ".html") + // _ = os.Remove(filename + ".json") + _ = os.Remove(filename + ".txt") + } else { + if exists(filename + ".txt") { + return true + } + } + time.Sleep(2 * time.Second) + + req, err := http.NewRequest("GET", url, nil) + p(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + p(err) + body, err := io.ReadAll(resp.Body) + p(err) + p(resp.Body.Close()) + + body = u.HtmlFix(body) + + doc, err := gokogiri.ParseHtml(body) + p(err) + + /* + + s := string(body) + + ok = true + i1 := strings.Index(s, ``) + if i2 < i1 { + ok = false + } else { + s = html.UnescapeString(s[i1:i2]) + } + } + if !ok { + _ = w(fmt.Errorf("script jsonld not found: %s", url)) + + fp, err := os.Create(filename + ".err") + p(err) + p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false + } + + var graph GraphT + p(json.Unmarshal([]byte(s), &graph)) + for _, g := range graph.Graph { + t := g["@type"] + switch v := t.(type) { + case string: + if v == "NewsArticle" { + b, err := json.Marshal(g) + p(err) + s = string(b) + } + } + } + + fp, err := os.Create(filename + ".json") + p(err) + p(fp.WriteString(s)) + p(fp.Close()) + p(os.Chtimes(filename+".json", timestamp, timestamp)) + */ + + root := doc.Root() + + articles, err := root.Search(`//article[@id="article-content"]`) + p(err) + if len(articles) == 0 { + _ = w(fmt.Errorf("empty: %s", url)) + + fp, err := os.Create(filename + ".err") + p(err) + p(fmt.Fprintf(fp, "empty: %s\n", url)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false + } + article := articles[0] + + live, err := article.Search(`.//*[@data-test-id="live-blog-label"]`) + p(err) + if len(live) > 0 { + fp, err := os.Create(filename + ".skip") + p(fp.WriteString("liveblog\n")) + p(err) + p(os.Chtimes(filename+".skip", timestamp, timestamp)) + return true + } + + headers, err := article.Search(`.//header`) + p(err) + if len(headers) == 0 { + _ = w(fmt.Errorf("no header: %s", url)) + + fp, err := os.Create(filename + ".err") + p(err) + p(fmt.Fprintf(fp, "no elements: %s\n", url)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false + + } + header := headers[0] + + isOpinie := false + tags := make([]string, 0) + ell, err := header.Search(`.//*[@data-test-id="article-label"]`) + p(err) + if len(ell) == 0 { + _ = w(fmt.Errorf("no labels: %s", url)) + } + for _, el := range ell { + s := strings.TrimSpace(el.Content()) + if s != "" && s != "Nieuws" { + tags = append(tags, s) + } + if strings.ToLower(s) == "opinie" { + isOpinie = true + } + } + + pars := make([]string, 0) + + found := false + ell, err = header.Search(`.//*[@data-test-id="header-intro"]`) + p(err) + for _, el := range ell { + s := strings.TrimSpace(el.Content()) + if s != "" { + pars = append(pars, s) + found = true + } + } + if !found && !isOpinie { + _ = w(fmt.Errorf("no intro: %s", url)) + } + + specials, err := article.Search(`.//section//aside | .//section//figure | .//section//b`) + p(err) + for _, special := range specials { + special.Remove() + } + + ell, err = article.Search(`.//section//*[@data-article-element-index]`) + p(err) + if len(ell) == 0 { + _ = w(fmt.Errorf("no elements: %s", url)) + + fp, err := os.Create(filename + ".err") + p(err) + p(fmt.Fprintf(fp, "no elements: %s\n", url)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false + } + + found = false + for _, el := range ell { + s := strings.TrimSpace(el.Content()) + if s != "" { + pars = append(pars, s) + found = true + } + } + if !found { + _ = w(fmt.Errorf("no text, skipping: %s", url)) + fp, err := os.Create(filename + ".skip") + p(fp.WriteString(url + "\n")) + p(err) + p(os.Chtimes(filename+".skip", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) + + return true + } + + fp, err := os.Create(filename + ".txt") + p(err) + + if len(tags) == 0 { + p(fmt.Fprintln(fp, "##META text tag =")) + } else { + for _, tag := range tags { + p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag))) + } + } + + p(fp.WriteString(u.AddEnd(u.FixSpace(title)))) + + for _, par := range pars { + p(fp.WriteString(u.AddEnd(u.FixSpace(par)))) + } + + p(fp.Close()) + + p(os.Chtimes(filename+".txt", timestamp, timestamp)) + + return true +} diff --git a/Volkskrant/txt2corpus.sh b/Volkskrant/txt2corpus.sh new file mode 100755 index 0000000..eddb31d --- /dev/null +++ b/Volkskrant/txt2corpus.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +set -e + +BASE=/net/corpora/nlnieuws +PART=$BASE/Volkskrant + +unset CDPATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH +export TZ=Europe/Amsterdam +. /net/aps/etc/alpino-activate.sh > /dev/null + +if [ "$1" = "" ] +then + ds=`date -d -2days +%Y-%m-%d` +else + case "$1" in + 2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]) + ds=$1 + ;; + *) + echo INVALID + exit 1 + ;; + esac +fi + +dp=${ds//-//} +year=${ds%%-*} +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year + +cd $PART/$dp + +ln -s lock.$$ lock +if [ "`readlink lock`" != lock.$$ ] +then + echo Getting lock failed + exit 1 +fi + +rm -fr out +mkdir out + +rm -f $corpus.lines +for i in *.txt +do + b=`basename $i .txt` + perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \ + | perl -e '$n = 0; while(<>) { $n++; print("vk.'$b'.$n|$_"); }' \ + >> $corpus.lines +done + +cd out +mkdir xml +Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log + +metadata + +cd xml +rm -f $corpus.data.dz $corpus.index +alto -q -o $corpus.data.dz *.xml + +# telling per bericht, niet per zin +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt + +cd ../.. +rm -fr out + +rm -f lock diff --git a/collect.sh b/collect.sh index 43b5ce8..056ba1b 100755 --- a/collect.sh +++ b/collect.sh @@ -43,7 +43,7 @@ cd /net/corpora/nlnieuws/data/$year declare -A parts #parts[alles]='.' -parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso' +parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso|Volkskrant' parts[amsterdam]='AT5|BuurtAdam|Parool' parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom' parts[literatuur]='LitNL|Tzum' @@ -64,6 +64,7 @@ parts[vlaanderen]='HLN|VRT' #parts[Sargasso]='Sargasso' #parts[Sikkom]='Sikkom' #parts[Tzum]='Tzum' +#parts[Volkskrant]='Volkskrant' #parts[VRT]='VRT' for part in ${!parts[@]} diff --git a/internal/util/util.go b/internal/util/util.go index ee5a896..a1a12e1 100644 --- a/internal/util/util.go +++ b/internal/util/util.go @@ -11,19 +11,25 @@ import ( ) var ( - p = e.PanicErr - reEOL = regexp.MustCompile(`[.!?]['"”’]?$`) - reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}+\.?`) - reLET = regexp.MustCompile(`\p{Lu}`) - reBody = regexp.MustCompile(`<[bB][rR][ /]*>`) + p = e.PanicErr + reEOL = regexp.MustCompile(`[.!?]['"”’]?$`) + reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}+\.?`) + reLET = regexp.MustCompile(`\p{Lu}`) + reBody = regexp.MustCompile(`<[bB][rR][ /]*>`) + reQuotLeft = regexp.MustCompile(`|`) + reQuotRight = regexp.MustCompile(`|`) ) -func HtmlFix(body []byte) []byte { - return reBody.ReplaceAllLiteral(body, []byte(" ")) +func HtmlFix(html []byte) []byte { + html = reQuotLeft.ReplaceAllLiteral(html, []byte(" „")) + html = reQuotRight.ReplaceAllLiteral(html, []byte("” ")) + return reBody.ReplaceAllLiteral(html, []byte(" ")) } -func HtmlFixString(body string) string { - return reBody.ReplaceAllLiteralString(body, " ") +func HtmlFixString(html string) string { + html = reQuotLeft.ReplaceAllLiteralString(html, " „") + html = reQuotRight.ReplaceAllLiteralString(html, "” ") + return reBody.ReplaceAllLiteralString(html, " ") } func AddEnd(s string) string { diff --git a/www/app.html b/www/app.html index 21b8457..fa63a94 100644 --- a/www/app.html +++ b/www/app.html @@ -110,11 +110,16 @@
Reporters Online - +
Sargasso + + +
+ de Volkskrant + Amsterdam
@@ -122,16 +127,20 @@ -
+
- In de buurt Amsterdam + Het Parool | Amsterdam -
+
- Parool Amsterdam + In de buurt | Amsterdam @@ -147,7 +156,9 @@
- In de buurt Groningen + In de buurt | Groningen