diff --git a/.gitignore b/.gitignore index 77d784e..9a7719b 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,8 @@ NU/nu Oog/metadata Oog/oog Oog/xml2txt +Parool/metadata +Parool/parool RO/metadata RO/ro RO/xml2txt diff --git a/Amsterdam/Makefile b/Amsterdam/Makefile deleted file mode 100644 index ac0dfb6..0000000 --- a/Amsterdam/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -all: amsterdam - -% : %.go - go build $< diff --git a/Amsterdam/amsterdam.go b/Amsterdam/amsterdam.go deleted file mode 100644 index 063d8ea..0000000 --- a/Amsterdam/amsterdam.go +++ /dev/null @@ -1,200 +0,0 @@ -package main - -import ( - e "codeberg.org/pebbe/errors" - "github.com/jbowtie/gokogiri" - - "encoding/xml" - "fmt" - "io" - "net/http" - "net/url" - "os" - "path/filepath" - "strings" - "time" -) - -type Rss struct { - XMLName xml.Name `xml:"rss"` - Items []ItemT `xml:"channel>item"` -} - -type ItemT struct { - Title string `xml:"title"` - PubDate string `xml:"pubDate"` - UnixTime int64 `xml:"unixTime"` - Guid string `xml:"guid"` - Link string `xml:"link"` - Data []byte `xml:",innerxml"` -} - -var ( - p = e.PanicErr - agent = "AhrefsBot/7.0" - // agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36" -) - -func exists(filename string) bool { - _, err := os.Stat(filename) - return err == nil -} - -func main() { - defer func() { - if e.Panicked { - _ = recover() - os.Exit(1) - } - }() - - myLock := "/net/corpora/nlnieuws/Amsterdam/lock" - mkLock(myLock) - defer func() { - _ = os.Remove(myLock) - }() - - req, err := http.NewRequest("GET", "https://www.amsterdam.nl/nieuws/nieuwsoverzicht/?rss=true", nil) - p(err) - req.Header.Set("User-Agent", agent) - - client := &http.Client{} - resp, err := client.Do(req) - p(err) - body, err := io.ReadAll(resp.Body) - p(err) - p(resp.Body.Close()) - - var rss Rss - p(xml.Unmarshal(body, &rss)) - - if len(rss.Items) == 0 { - p(fmt.Errorf("len(rss.Items) == 0")) - } - - for _, item := range rss.Items { - t, err := time.Parse(time.RFC1123Z, item.PubDate) - if err != nil { - t, err = time.Parse(time.RFC1123, item.PubDate) - } - p(err) - dirname := fmt.Sprintf("/net/corpora/nlnieuws/Amsterdam/%d/%02d", t.Year(), int(t.Month())) - if exists(dirname + "/lock") { - continue - } - filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "https://www.amsterdam.nl/nieuws/")) - p(os.MkdirAll(dirname, 0777)) - func() { - var ok bool - defer func() { - if e.Panicked { - fmt.Fprintln(os.Stderr, "----", filename, "----") - } - if !ok { - _ = os.Remove(filename + ".xml") - } - }() - fp, err := os.Create(filename + ".xml") - p(err) - p(fp.WriteString("\n\n")) - p(fmt.Fprintf(fp, "%d", t.Unix())) - p(fp.Write(item.Data)) - p(fp.WriteString("\n")) - p(fp.Close()) - p(os.Chtimes(filename+".xml", t, t)) - doArticle(filename, item.Link, item.Title, t) - ok = true - }() - } -} - -func doArticle(filename string, url string, title string, timestamp time.Time) { - if exists(filename + ".txt") { - return - } - time.Sleep(2 * time.Second) - - req, err := http.NewRequest("GET", url, nil) - p(err) - req.Header.Set("User-Agent", agent) - - client := &http.Client{} - resp, err := client.Do(req) - p(err) - body, err := io.ReadAll(resp.Body) - p(err) - p(resp.Body.Close()) - - doc, err := gokogiri.ParseHtml(body) - p(err) - - root := doc.Root() - - fp, err := os.Create(filename + ".txt") - p(err) - - p(fp.WriteString(addEnd(title))) - - count := 0 - - pp, err := root.Search(`//div[@id="zone_intro"]//div[contains(@class, "inleiding")]/p`) - p(err) - for _, p1 := range pp { - p(fp.WriteString(addEnd(p1.Content()))) - count++ - } - - ell, err := root.Search(`//div[@id="zone_content"]//div[contains(@class, "tekst")]/child::*`) - p(err) - for _, el := range ell { - if n := el.Name(); n == "p" || n == "h3" { - p(fp.WriteString(addEnd(el.Content()))) - count++ - } - } - - p(fp.Close()) - - p(os.Chtimes(filename+".txt", timestamp, timestamp)) - - if count == 0 { - fp, err := os.Create(filename + ".debug.html") - p(err) - p(fp.Write(body)) - p(fp.Close()) - p(os.Chtimes(filename+".debug.html", timestamp, timestamp)) - } -} - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func mkLock(filename string) { - pid := os.Getpid() - link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid) - p(os.Symlink(link, filename)) - - name, err := os.Readlink(filename) - p(err) - - if name != link { - p(fmt.Errorf("wrong lock name %q, should be %q", name, link)) - } -} diff --git a/Makefile b/Makefile index 23b05ee..fbe6c0b 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,7 @@ all: make -C NOS make -C NU make -C Oog + make -C Parool make -C RO make -C RTVNoord make -C Sargasso diff --git a/Parool/Makefile b/Parool/Makefile new file mode 100644 index 0000000..3575e76 --- /dev/null +++ b/Parool/Makefile @@ -0,0 +1,9 @@ +all: \ + metadata \ + parool + +metadata: cmd/metadata/*.go + go build -o $@ $^ + +parool: cmd/parool/*.go + go build -o $@ $^ diff --git a/Parool/cmd/metadata/metadata.go b/Parool/cmd/metadata/metadata.go new file mode 100644 index 0000000..6427f23 --- /dev/null +++ b/Parool/cmd/metadata/metadata.go @@ -0,0 +1,131 @@ +package main + +import ( + e "codeberg.org/pebbe/errors" + + "bufio" + "encoding/xml" + "fmt" + "html" + "os" + "strings" + "time" +) + +type Item struct { + XMLName xml.Name `xml:"item"` + UnixTime int64 `xml:"unixTime"` +} + +var ( + x = e.ExitErr + escape = html.EscapeString + data = make(map[string][]string) + location *time.Location +) + +func main() { + var err error + location, err = time.LoadLocation("Europe/Amsterdam") + x(err) + + files, err := os.ReadDir(".") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText("", filename) + } else if strings.HasSuffix(filename, ".xml") { + doXml("", filename) + } + } + files, err = os.ReadDir("..") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText("../", filename) + } else if strings.HasSuffix(filename, ".xml") { + doXml("../", filename) + } + } + + files, err = os.ReadDir("xml") + x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".xml") { + continue + } + aa := strings.Split(filename, ".") + base := strings.Join(aa[1:len(aa)-2], ".") + b, err := os.ReadFile("xml/" + filename) + x(err) + s := string(b) + i := strings.Index(s, "\n \n")) + for _, m := range data[base] { + x(fp.WriteString(" " + m + "\n")) + } + x(fp.WriteString(" \n ")) + x(fp.WriteString(stripMeta(s[i:]))) + x(fp.Close()) + x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) + } +} + +func doText(dirname, filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + fp, err := os.Open(dirname + filename) + x(err) + defer func() { x(fp.Close()) }() + scanner := bufio.NewScanner(fp) + for scanner.Scan() { + line := scanner.Text() + if !strings.HasPrefix(line, "##META") { + continue + } + aa := strings.Fields(line) + if len(aa) > 4 { + data[base] = append(data[base], + fmt.Sprintf(``, + aa[1], + escape(aa[2]), + escape(strings.Join(aa[4:], " ")))) + } + } + x(scanner.Err()) +} + +func doXml(dirname, filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + b, err := os.ReadFile(dirname + filename) + x(err) + var item Item + x(xml.Unmarshal(b, &item)) + t := time.Unix(item.UnixTime, 0).In(location) + data[base] = append(data[base], + fmt.Sprintf(``, + t.Year(), + int(t.Month()), + t.Day())) +} + +func stripMeta(s string) string { + i1 := strings.Index(s, "") + if i1 < 0 { + return s + } + i2 := i1 + strings.Index(s[i1:], "") + 11 + return s[:i1] + strings.TrimLeft(s[i2:], " \t\r\n") +} diff --git a/Parool/cmd/parool/parool.go b/Parool/cmd/parool/parool.go new file mode 100644 index 0000000..1ab5502 --- /dev/null +++ b/Parool/cmd/parool/parool.go @@ -0,0 +1,318 @@ +package main + +import ( + e "codeberg.org/pebbe/errors" + "github.com/jbowtie/gokogiri" + + "encoding/xml" + "fmt" + "io" + "net/http" + "net/url" + "os" + "path/filepath" + "strings" + "time" +) + +type Rss struct { + XMLName xml.Name `xml:"rss"` + Items []ItemT `xml:"channel>item"` +} + +type ItemT struct { + PubDate string `xml:"pubDate"` + UnixTime int64 `xml:"unixTime"` + Guid string `xml:"guid"` + Link string `xml:"link"` + Data []byte `xml:",innerxml"` +} + +var ( + p = e.PanicErr + w = e.WarnErr + agent = "AhrefsBot/7.0" +) + +func exists(filename string) bool { + _, err := os.Stat(filename) + return err == nil +} + +func fileDate(filename string) string { + b, err := os.ReadFile(filename) + if err != nil { + return "" + } + s := string(b) + i1 := strings.Index(s, "") + 10 + i2 := strings.Index(s, "") + if i2 < i1 { + return "" + } + return s[i1:i2] +} + +func main() { + defer func() { + if e.Panicked { + _ = recover() + os.Exit(1) + } + }() + + myLock := "/net/corpora/nlnieuws/Parool/lock" + mkLock(myLock) + defer func() { + _ = os.Remove(myLock) + }() + + req, err := http.NewRequest("GET", "https://www.parool.nl/amsterdam/rss.xml", nil) + p(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + p(err) + body, err := io.ReadAll(resp.Body) + p(err) + p(resp.Body.Close()) + + var rss Rss + p(xml.Unmarshal(body, &rss)) + + if len(rss.Items) == 0 { + p(fmt.Errorf("len(rss.Items) == 0")) + } + + for _, item := range rss.Items { + t, err := time.Parse(time.RFC1123Z, item.PubDate) + if err != nil { + t, err = time.Parse(time.RFC1123, item.PubDate) + } + p(err) + year, week := t.ISOWeek() + dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/%02d", year, week) + if exists(dirname + "/lock") { + continue + } + basename := item.Guid + filename := dirname + "/" + url.PathEscape(basename) + + ts := fmt.Sprintf("%d", t.Unix()) + needUpdate := fileDate(filename+".xml") != ts + + p(os.MkdirAll(dirname, 0777)) + func() { + var ok bool + defer func() { + if e.Panicked { + fmt.Fprintln(os.Stderr, "----", filename) + fmt.Fprintln(os.Stderr, "----", item.Link) + } + if !ok { + _ = os.Remove(filename + ".xml") + } + }() + fp, err := os.Create(filename + ".xml") + p(err) + p(fp.WriteString("\n\n")) + p(fmt.Fprintf(fp, "%d", t.Unix())) + p(fp.Write(item.Data)) + p(fp.WriteString("\n")) + p(fp.Close()) + p(os.Chtimes(filename+".xml", t, t)) + ok = doArticle(filename, item.Link, t, needUpdate) + }() + } +} + +func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) (ok bool) { + if exists(filename + ".skip") { + return true + } + if needUpdate { + _ = os.Remove(filename + ".err") + _ = os.Remove(filename + ".html") + _ = os.Remove(filename + ".txt") + } else { + if exists(filename + ".txt") { + return true + } + } + time.Sleep(2 * time.Second) + + req, err := http.NewRequest("GET", url, nil) + p(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + p(err) + body, err := io.ReadAll(resp.Body) + p(err) + p(resp.Body.Close()) + + doc, err := gokogiri.ParseHtml(body) + p(err) + + root := doc.Root() + + articles, err := root.Search(`//article[@id="article-content"]`) + p(err) + if len(articles) == 0 { + _ = w(fmt.Errorf("empty: %s", url)) + + fp, err := os.Create(filename + ".err") + p(err) + p(fmt.Fprintf(fp, "empty: %s\n", url)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false + } + article := articles[0] + + tags := make([]string, 0) + ell, err := article.Search(`//header//*[@data-test-id="article-label"]`) + p(err) + for _, el := range ell { + s := strings.TrimSpace(el.Content()) + if s != "" { + tags = append(tags, s) + } + } + + fouten := make([]string, 0) + pars := make([]string, 0) + + ell, err = article.Search(`//header//*[@data-test-id="article-title"]`) + p(err) + for _, el := range ell { + s := strings.TrimSpace(el.Content()) + if s != "" { + pars = append(pars, s) + } + } + + found := false + ell, err = article.Search(`//header//*[@data-test-id="header-intro"]`) + p(err) + for _, el := range ell { + s := strings.TrimSpace(el.Content()) + if s != "" { + pars = append(pars, s) + found = true + } + } + if !found { + fouten = append(fouten, fmt.Sprintf("no heading: %s\n", url)) + _ = w(fmt.Errorf("no heading: %s", url)) + } + + specials, err := article.Search(`//section//aside | //section//figure | //section//b`) + p(err) + for _, special := range specials { + special.Remove() + } + + found = false + ell, err = article.Search(`//section//*[@data-article-element-index]`) + p(err) + for _, el := range ell { + s := strings.TrimSpace(el.Content()) + if s != "" { + pars = append(pars, s) + found = true + } + } + if !found { + fouten = append(fouten, fmt.Sprintf("no text: %s\n", url)) + _ = w(fmt.Errorf("no text: %s", url)) + } + + if len(fouten) > 0 { + fp, err := os.Create(filename + ".err") + p(err) + for _, fout := range fouten { + p(fp.WriteString(fout)) + } + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false + } + + fp, err := os.Create(filename + ".txt") + p(err) + + if len(tags) == 0 { + p(fmt.Fprintln(fp, "##META text tag =")) + } else { + for _, tag := range tags { + p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag))) + } + } + + for _, par := range pars { + p(fp.WriteString(addEnd(fixSpace(par)))) + } + + p(fp.Close()) + + p(os.Chtimes(filename+".txt", timestamp, timestamp)) + + return true +} + +func addEnd(s string) string { + s = strings.TrimSpace(s) + n := len(s) + if n == 0 { + return "" + } + if n > 0 { + if strings.ContainsAny(s[n-1:], ".!?") { + return s + "\n" + } + } + if n > 1 { + s2 := s[n-2:] + if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { + return s + "\n" + } + } + if strings.HasSuffix(s, `.”`) || strings.HasSuffix(s, `!”`) || strings.HasSuffix(s, `?”`) { + return s + "\n" + } + return s + ".\n" +} + +func fixSpace(s string) string { + return strings.Join(strings.Fields(s), " ") +} + +func mkLock(filename string) { + pid := os.Getpid() + link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid) + p(os.Symlink(link, filename)) + + name, err := os.Readlink(filename) + p(err) + + if name != link { + p(fmt.Errorf("wrong lock name %q, should be %q", name, link)) + } +} diff --git a/Parool/txt2corpus.sh b/Parool/txt2corpus.sh new file mode 100755 index 0000000..46b0e62 --- /dev/null +++ b/Parool/txt2corpus.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +set -e + +unset CDPATH +PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +export TZ=Europe/Amsterdam +. /net/aps/etc/alpino-activate.sh > /dev/null + +if [ "$1" = "" ] +then + ds=`date -d -7days +%G-%V` +else + case "$1" in + 2[0-9][0-9][0-9]-[0-5][0-9]) + ds=$1 + ;; + *) + echo INVALID + exit 1 + ;; + esac +fi + +dp=${ds//-//} + +corpus=/net/corpora/nlnieuws/Parool/corpus/$ds + +cd /net/corpora/nlnieuws/Parool/$dp + +ln -s lock.$$ lock +if [ "`readlink lock`" != lock.$$ ] +then + echo Getting lock failed + exit 1 +fi + +rm -fr out +mkdir out + +rm -f $corpus.lines +for i in *.txt +do + b=`basename $i .txt` + perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \ + | perl -e '$n = 0; while(<>) { $n++; print("parool.'$b'.$n|$_"); }' \ + >> $corpus.lines +done + +cd out +mkdir xml +Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log + +../../../metadata + +cd xml +rm -f $corpus.data.dz $corpus.index +alto -q -o $corpus.data.dz *.xml + +# telling per bericht, niet per zin +/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt + +cd ../.. +rm -fr out + +rm -f lock diff --git a/collect.sh b/collect.sh index a76ef57..fffb9f7 100755 --- a/collect.sh +++ b/collect.sh @@ -40,7 +40,7 @@ cd /net/corpora/nlnieuws/data declare -A parts #parts[alles]='.' parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso' -parts[amsterdam]='AT5|BuurtAdam' +parts[amsterdam]='AT5|BuurtAdam|Parool' parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom' parts[literatuur]='LitNL|Tzum' parts[vlaanderen]='VRT' @@ -53,6 +53,7 @@ parts[vlaanderen]='VRT' #parts[NU]='NU' #parts[NieuwsNL]='NieuwsNL' #parts[Oog]='Oog' +#parts[Parool]='Parool' #parts[RO]='RO' #parts[RTVNoord]='RTVNoord' #parts[Sargasso]='Sargasso'