diff --git a/.gitignore b/.gitignore
index 795f7f6..713ab78 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,6 +38,8 @@ Sikkom/sikkom
Tzum/metadata
Tzum/tzum
Tzum/xml2txt
+Volkskrant/metadata
+Volkskrant/volkskrant
VRT/metadata
VRT/vrt
bin/data2json
diff --git a/Makefile b/Makefile
index 3fd7538..0513cd9 100644
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,7 @@ all:
make -C Sargasso
make -C Sikkom
make -C Tzum
+ make -C Volkskrant
make -C VRT
make bin/data2json
make bin/dates2json
diff --git a/Parool/cmd/parool/parool.go b/Parool/cmd/parool/parool.go
index 53e5540..b446953 100644
--- a/Parool/cmd/parool/parool.go
+++ b/Parool/cmd/parool/parool.go
@@ -280,6 +280,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
}
header := headers[0]
+ isVideo := false
tags := make([]string, 0)
ell, err := header.Search(`.//*[@data-test-id="article-label"]`)
p(err)
@@ -291,6 +292,9 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
if s != "" && s != "Nieuws" {
tags = append(tags, s)
}
+ if strings.ToLower(s) == "video" {
+ isVideo = true
+ }
}
pars := make([]string, 0)
@@ -344,7 +348,9 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
}
}
if !found {
- _ = w(fmt.Errorf("no text, skipping: %s", url))
+ if !isVideo {
+ _ = w(fmt.Errorf("no text, skipping: %s", url))
+ }
fp, err := os.Create(filename + ".skip")
p(fp.WriteString(url + "\n"))
p(err)
diff --git a/README.md b/README.md
index 1db09c5..fa95f72 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,7 @@ crontab van p209327@colossus
17 * * * * /net/corpora/nlnieuws/Sikkom/sikkom
18 * * * * /net/corpora/nlnieuws/Tzum/tzum
19 * * * * /net/corpora/nlnieuws/VRT/vrt
+20 * * * * /net/corpora/nlnieuws/Volkskrant/volkskrant
```
## 2. Teksten verwerken: omzetten naar zinnen, parsen, metadata toevoegen
@@ -53,6 +54,7 @@ crontab van p209327@colossus
0 1 * * * /net/corpora/nlnieuws/NU/txt2corpus.sh
0 1 * * * /net/corpora/nlnieuws/NieuwsNL/txt2corpus.sh
0 1 * * * /net/corpora/nlnieuws/VRT/txt2corpus.sh
+0 1 * * * /net/corpora/nlnieuws/Volkskrant/txt2corpus.sh
# weinig data: alleen op dinsdag
0 1 * * 2 /net/corpora/nlnieuws/AT5/txt2corpus.sh
0 1 * * 2 /net/corpora/nlnieuws/BuurtAdam/txt2corpus.sh
diff --git a/Tzum/cmd/xml2txt/xml2txt.go b/Tzum/cmd/xml2txt/xml2txt.go
index 3d8e7c6..d8e8b47 100644
--- a/Tzum/cmd/xml2txt/xml2txt.go
+++ b/Tzum/cmd/xml2txt/xml2txt.go
@@ -24,8 +24,6 @@ var (
x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
- reEM = regexp.MustCompile(`::EM::.*?::/EM::`)
- reTitle = regexp.MustCompile(`^\p{Lu}`)
)
func main() {
@@ -68,7 +66,7 @@ func main() {
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
}
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
- doc, err := gokogiri.ParseHtml([]byte(`
` + em1(u.HtmlFixString(item.Text)) + ``))
+ doc, err := gokogiri.ParseHtml([]byte(`` + u.HtmlFixString(item.Text) + ``))
x(err)
root := doc.Root()
pp, err := root.Search(`//body/p`)
@@ -76,26 +74,9 @@ func main() {
for _, p := range pp {
s := p.Content()
if !strings.Contains(s, "verscheen eerst op Tzum.") {
- x(fp.WriteString(em2(u.AddEnd(u.FixSpace(p.Content())))))
+ x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
}
}
x(fp.Close())
}
}
-
-func em1(s string) string {
- return strings.ReplaceAll(
- strings.ReplaceAll(s, "", " ::EM::"),
- "",
- "::/EM:: ")
-}
-
-func em2(s string) string {
- return reEM.ReplaceAllStringFunc(s, func(s1 string) string {
- s1 = s1[6 : len(s1)-7]
- if reTitle.MatchString(s1) {
- return `"` + s1 + `"`
- }
- return s1
- })
-}
diff --git a/Volkskrant/Makefile b/Volkskrant/Makefile
new file mode 100644
index 0000000..841a156
--- /dev/null
+++ b/Volkskrant/Makefile
@@ -0,0 +1,9 @@
+# Build both Volkskrant tools: the metadata injector and the RSS collector.
+all: \
+ metadata \
+ volkskrant
+
+# $^ expands to every prerequisite, so all files of cmd/metadata are compiled
+# together into the "metadata" binary.
+metadata: cmd/metadata/*.go
+ go build -o $@ $^
+
+# The ../internal/util sources are listed only so that a change there triggers
+# a rebuild; $< passes just the FIRST prerequisite to "go build".
+# NOTE(review): this relies on cmd/volkskrant containing a single .go file.
+volkskrant: cmd/volkskrant/*.go ../internal/util/*.go
+ go build -o $@ $<
diff --git a/Volkskrant/cmd/metadata/metadata.go b/Volkskrant/cmd/metadata/metadata.go
new file mode 100644
index 0000000..49f82c7
--- /dev/null
+++ b/Volkskrant/cmd/metadata/metadata.go
@@ -0,0 +1,131 @@
+package main
+
+import (
+ e "codeberg.org/pebbe/errors"
+
+ "bufio"
+ "encoding/xml"
+ "fmt"
+ "html"
+ "os"
+ "strings"
+ "time"
+)
+
+// Item is the decoding target for a saved article .xml file: an <item> element
+// of which only the unixTime child is read (see doXml).
+type Item struct {
+ XMLName xml.Name `xml:"item"`
+ UnixTime int64 `xml:"unixTime"` // publication time in seconds since the Unix epoch
+}
+
+var (
+ // x is shorthand for e.ExitErr; every error-returning call is wrapped in it.
+ x = e.ExitErr
+ // escape is shorthand for html.EscapeString, applied to metadata names and values.
+ escape = html.EscapeString
+ // data maps an article base name (file name without its 4-character
+ // extension) to the metadata lines collected for it by doText and doXml.
+ data = make(map[string][]string)
+ // location is set in main to Europe/Amsterdam and used by doXml to turn
+ // a Unix timestamp into a calendar date.
+ location *time.Location
+)
+
+// main collects metadata from the .txt and .xml files in the current and the
+// parent directory, then rewrites every .xml file in the "xml" subdirectory
+// with the collected metadata lines for the matching article inserted.
+func main() {
+ var err error
+ location, err = time.LoadLocation("Europe/Amsterdam")
+ x(err)
+
+ // Pass 1: gather metadata from "." ...
+ files, err := os.ReadDir(".")
+ x(err)
+ for _, file := range files {
+ filename := file.Name()
+ if strings.HasSuffix(filename, ".txt") {
+ doText("", filename)
+ } else if strings.HasSuffix(filename, ".xml") {
+ doXml("", filename)
+ }
+ }
+ // ... and from "..", using the same dispatch on file extension.
+ files, err = os.ReadDir("..")
+ x(err)
+ for _, file := range files {
+ filename := file.Name()
+ if strings.HasSuffix(filename, ".txt") {
+ doText("../", filename)
+ } else if strings.HasSuffix(filename, ".xml") {
+ doXml("../", filename)
+ }
+ }
+
+ // Pass 2: inject the metadata into each parsed sentence file under xml/.
+ files, err = os.ReadDir("xml")
+ x(err)
+ for _, file := range files {
+ filename := file.Name()
+ if !strings.HasSuffix(filename, ".xml") {
+ continue
+ }
+ // The article base name is the file name without its first and its last
+ // two dot-separated parts.
+ // NOTE(review): a name with fewer than three parts makes this slice
+ // expression panic; presumably names look like prefix.<base>.<n>.xml.
+ aa := strings.Split(filename, ".")
+ base := strings.Join(aa[1:len(aa)-2], ".")
+ b, err := os.ReadFile("xml/" + filename)
+ x(err)
+ s := string(b)
+ // NOTE(review): the next line is damaged in this copy of the patch: the
+ // search string, the creation of "xml/"+filename+".tmp" as fp and the
+ // start of the first WriteString call are missing. Restore from the
+ // repository before relying on it.
+ i := strings.Index(s, "\n \n"))
+ for _, m := range data[base] {
+ x(fp.WriteString(" " + m + "\n"))
+ }
+ x(fp.WriteString(" \n "))
+ // Drop any metadata block already present in the remainder of the file.
+ x(fp.WriteString(stripMeta(s[i:])))
+ x(fp.Close())
+ // Replace the original with the freshly written temporary file.
+ x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
+ }
+}
+
+// doText reads dirname+filename line by line and stores one metadata entry in
+// data for every "##META" line that has more than four whitespace-separated
+// fields. The map key is filename without its last four characters.
+func doText(dirname, filename string) {
+ base := filename[:len(filename)-4]
+ // Make sure the key exists even when the file yields no metadata lines.
+ if _, ok := data[base]; !ok {
+ data[base] = make([]string, 0)
+ }
+ fp, err := os.Open(dirname + filename)
+ x(err)
+ defer func() { x(fp.Close()) }()
+ scanner := bufio.NewScanner(fp)
+ for scanner.Scan() {
+ line := scanner.Text()
+ if !strings.HasPrefix(line, "##META") {
+ continue
+ }
+ // Fields 1 and 2 and everything from field 4 onwards are used; field 3
+ // is skipped (presumably the "=" sign - verify against the writer).
+ aa := strings.Fields(line)
+ if len(aa) > 4 {
+ // NOTE(review): the format string below is empty in this copy of the
+ // patch although three arguments are passed; the original literal
+ // was apparently stripped and must be restored from the repository.
+ data[base] = append(data[base],
+ fmt.Sprintf(``,
+ aa[1],
+ escape(aa[2]),
+ escape(strings.Join(aa[4:], " "))))
+ }
+ }
+ x(scanner.Err())
+}
+
+// doXml decodes dirname+filename into an Item and stores one metadata entry in
+// data, built from the year, month and day of the item's Unix timestamp as
+// seen in the package-level location. The map key is filename without its
+// last four characters.
+func doXml(dirname, filename string) {
+ base := filename[:len(filename)-4]
+ // Make sure the key exists before appending.
+ if _, ok := data[base]; !ok {
+ data[base] = make([]string, 0)
+ }
+ b, err := os.ReadFile(dirname + filename)
+ x(err)
+ var item Item
+ x(xml.Unmarshal(b, &item))
+ t := time.Unix(item.UnixTime, 0).In(location)
+ // NOTE(review): the format string below is empty in this copy of the patch
+ // although three arguments are passed; the original literal was apparently
+ // stripped and must be restored from the repository.
+ data[base] = append(data[base],
+ fmt.Sprintf(``,
+ t.Year(),
+ int(t.Month()),
+ t.Day()))
+}
+
+// stripMeta removes the span between an opening and a closing marker from s,
+// together with the whitespace that follows the closing marker. When the
+// opening marker is absent, s is returned unchanged.
+//
+// NOTE(review): both marker literals are empty in this copy of the patch; the
+// fixed offset of 11 suggests an 11-character closing marker. Restore the
+// literals from the repository.
+func stripMeta(s string) string {
+ i1 := strings.Index(s, "")
+ if i1 < 0 {
+ return s
+ }
+ // i2 is the offset just past the closing marker. A missing closing marker is
+ // not checked for: an Index result of -1 would put i2 at i1+10.
+ i2 := i1 + strings.Index(s[i1:], "") + 11
+ return s[:i1] + strings.TrimLeft(s[i2:], " \t\r\n")
+}
diff --git a/Volkskrant/cmd/volkskrant/volkskrant.go b/Volkskrant/cmd/volkskrant/volkskrant.go
new file mode 100644
index 0000000..121b69d
--- /dev/null
+++ b/Volkskrant/cmd/volkskrant/volkskrant.go
@@ -0,0 +1,387 @@
+package main
+
+import (
+ e "codeberg.org/pebbe/errors"
+ "github.com/jbowtie/gokogiri"
+
+ u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
+ //"encoding/json"
+ "encoding/xml"
+ "fmt"
+ //"html"
+ "io"
+ "net/http"
+ "net/url"
+ "os"
+ "strings"
+ "time"
+)
+
+// Rss is the decoding target for the RSS feed: the <rss> root element with the
+// <item> elements found under <channel>.
+type Rss struct {
+ XMLName xml.Name `xml:"rss"`
+ Items []ItemT `xml:"channel>item"`
+}
+
+// ItemT holds the children of one feed <item> that the collector uses.
+type ItemT struct {
+ PubDate string `xml:"pubDate"` // parsed in main as RFC1123Z, falling back to RFC1123
+ UnixTime int64 `xml:"unixTime"`
+ Guid string `xml:"guid"` // path-escaped in main to form the on-disk base name
+ Link string `xml:"link"` // article URL handed to doArticle
+ Title string `xml:"title"`
+ Data []byte `xml:",innerxml"` // raw inner XML of the item, copied verbatim into the saved .xml
+}
+
+/*
+type GraphT struct {
+ Graph []map[string]any `json:"@graph"`
+}
+*/
+
+var (
+ // p is shorthand for e.PanicErr and wraps calls whose failure aborts the run.
+ p = e.PanicErr
+ // w is shorthand for e.WarnErr and reports problems that do not abort the run.
+ w = e.WarnErr
+ // agent is sent as the User-Agent header on every HTTP request.
+ agent = "AhrefsBot/7.0"
+)
+
+// exists reports whether os.Stat succeeds for filename. Any stat error,
+// not only "file does not exist", yields false.
+func exists(filename string) bool {
+	if _, statErr := os.Stat(filename); statErr != nil {
+		return false
+	}
+	return true
+}
+
+// fileDate returns the text found between an opening and a closing marker in
+// the file named filename. It returns "" when the file cannot be read or when
+// the closing marker lies before the end of the opening marker.
+//
+// NOTE(review): both marker literals are empty in this copy of the patch, which
+// would make this function always return ""; the fixed offset of 10 suggests a
+// 10-character opening marker. Restore the literals from the repository.
+func fileDate(filename string) string {
+ b, err := os.ReadFile(filename)
+ if err != nil {
+ return ""
+ }
+ s := string(b)
+ // i1 is the offset just past the opening marker; when the marker is absent
+ // Index yields -1 and i1 becomes 9, which the i2 < i1 test below catches
+ // as long as the closing marker is absent too.
+ i1 := strings.Index(s, "") + 10
+ i2 := strings.Index(s, "")
+ if i2 < i1 {
+ return ""
+ }
+ return s[i1:i2]
+}
+
+// main downloads the Volkskrant RSS feed and, for every item, saves the raw
+// item as an .xml file in a per-day directory and calls doArticle to fetch
+// the article text.
+func main() {
+ // Registered first, so it runs last: turn a panic raised through the e
+ // package into exit status 1 after the other deferred functions have run.
+ defer func() {
+ if e.Panicked {
+ _ = recover()
+ os.Exit(1)
+ }
+ }()
+
+ myLock := "/net/corpora/nlnieuws/Volkskrant/lock"
+ u.MkLock(myLock)
+ defer func() {
+ _ = os.Remove(myLock)
+ }()
+
+ req, err := http.NewRequest("GET", "https://www.volkskrant.nl/rss.xml", nil)
+ p(err)
+ req.Header.Set("User-Agent", agent)
+
+ // NOTE(review): the client has no Timeout and the response status code is
+ // not checked; a stalled connection would keep the lock held.
+ client := &http.Client{}
+ resp, err := client.Do(req)
+ p(err)
+ body, err := io.ReadAll(resp.Body)
+ p(err)
+ p(resp.Body.Close())
+
+ var rss Rss
+ p(xml.Unmarshal(body, &rss))
+
+ // An empty feed is treated as a fatal error.
+ if len(rss.Items) == 0 {
+ p(fmt.Errorf("len(rss.Items) == 0"))
+ }
+
+ for _, item := range rss.Items {
+ t, err := time.Parse(time.RFC1123Z, item.PubDate)
+ if err != nil {
+ t, err = time.Parse(time.RFC1123, item.PubDate)
+ }
+ p(err)
+ dirname := fmt.Sprintf("/net/corpora/nlnieuws/Volkskrant/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
+ // A "lock" entry in the day directory means the day must not be touched.
+ if exists(dirname + "/lock") {
+ continue
+ }
+ basename := item.Guid
+ filename := dirname + "/" + url.PathEscape(basename)
+
+ // The article is fetched again when the timestamp stored in an earlier
+ // .xml differs from the feed's current publication time.
+ ts := fmt.Sprintf("%d", t.Unix())
+ needUpdate := fileDate(filename+".xml") != ts
+
+ p(os.MkdirAll(dirname, 0777))
+ // Closure so that the deferred cleanup runs once per item.
+ func() {
+ var ok bool
+ defer func() {
+ if e.Panicked {
+ fmt.Fprintln(os.Stderr, "----", filename)
+ fmt.Fprintln(os.Stderr, "----", item.Link)
+ }
+ // Without a usable result the .xml is removed, so that fileDate
+ // returns "" next time and the article is retried.
+ if !ok {
+ _ = os.Remove(filename + ".xml")
+ }
+ }()
+ // The .xml is rewritten on every run, before doArticle decides whether
+ // any work is needed.
+ fp, err := os.Create(filename + ".xml")
+ p(err)
+ // NOTE(review): the string literals written here and three lines below
+ // are damaged in this copy of the patch (markup stripped, one literal
+ // broken across two lines). Restore them from the repository.
+ p(fp.WriteString("\n- \n"))
+ p(fmt.Fprintf(fp, "%d", t.Unix()))
+ p(fp.Write(item.Data))
+ p(fp.WriteString("
\n"))
+ p(fp.Close())
+ p(os.Chtimes(filename+".xml", t, t))
+ ok = doArticle(filename, item.Link, item.Title, t, needUpdate)
+ }()
+ }
+}
+
+func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) {
+ if exists(filename + ".skip") {
+ return true
+ }
+ if needUpdate {
+ _ = os.Remove(filename + ".err")
+ _ = os.Remove(filename + ".html")
+ // _ = os.Remove(filename + ".json")
+ _ = os.Remove(filename + ".txt")
+ } else {
+ if exists(filename + ".txt") {
+ return true
+ }
+ }
+ time.Sleep(2 * time.Second)
+
+ req, err := http.NewRequest("GET", url, nil)
+ p(err)
+ req.Header.Set("User-Agent", agent)
+
+ client := &http.Client{}
+ resp, err := client.Do(req)
+ p(err)
+ body, err := io.ReadAll(resp.Body)
+ p(err)
+ p(resp.Body.Close())
+
+ body = u.HtmlFix(body)
+
+ doc, err := gokogiri.ParseHtml(body)
+ p(err)
+
+ /*
+
+ s := string(body)
+
+ ok = true
+ i1 := strings.Index(s, ``)
+ if i2 < i1 {
+ ok = false
+ } else {
+ s = html.UnescapeString(s[i1:i2])
+ }
+ }
+ if !ok {
+ _ = w(fmt.Errorf("script jsonld not found: %s", url))
+
+ fp, err := os.Create(filename + ".err")
+ p(err)
+ p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
+ p(fp.Close())
+ p(os.Chtimes(filename+".err", timestamp, timestamp))
+
+ fp, err = os.Create(filename + ".html")
+ p(err)
+ p(fp.Write(body))
+ p(fp.Close())
+ p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+ return false
+ }
+
+ var graph GraphT
+ p(json.Unmarshal([]byte(s), &graph))
+ for _, g := range graph.Graph {
+ t := g["@type"]
+ switch v := t.(type) {
+ case string:
+ if v == "NewsArticle" {
+ b, err := json.Marshal(g)
+ p(err)
+ s = string(b)
+ }
+ }
+ }
+
+ fp, err := os.Create(filename + ".json")
+ p(err)
+ p(fp.WriteString(s))
+ p(fp.Close())
+ p(os.Chtimes(filename+".json", timestamp, timestamp))
+ */
+
+ root := doc.Root()
+
+ articles, err := root.Search(`//article[@id="article-content"]`)
+ p(err)
+ if len(articles) == 0 {
+ _ = w(fmt.Errorf("empty: %s", url))
+
+ fp, err := os.Create(filename + ".err")
+ p(err)
+ p(fmt.Fprintf(fp, "empty: %s\n", url))
+ p(fp.Close())
+ p(os.Chtimes(filename+".err", timestamp, timestamp))
+
+ fp, err = os.Create(filename + ".html")
+ p(err)
+ p(fp.Write(body))
+ p(fp.Close())
+ p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+ return false
+ }
+ article := articles[0]
+
+ live, err := article.Search(`.//*[@data-test-id="live-blog-label"]`)
+ p(err)
+ if len(live) > 0 {
+ fp, err := os.Create(filename + ".skip")
+ p(fp.WriteString("liveblog\n"))
+ p(err)
+ p(os.Chtimes(filename+".skip", timestamp, timestamp))
+ return true
+ }
+
+ headers, err := article.Search(`.//header`)
+ p(err)
+ if len(headers) == 0 {
+ _ = w(fmt.Errorf("no header: %s", url))
+
+ fp, err := os.Create(filename + ".err")
+ p(err)
+ p(fmt.Fprintf(fp, "no elements: %s\n", url))
+ p(fp.Close())
+ p(os.Chtimes(filename+".err", timestamp, timestamp))
+
+ fp, err = os.Create(filename + ".html")
+ p(err)
+ p(fp.Write(body))
+ p(fp.Close())
+ p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+ return false
+
+ }
+ header := headers[0]
+
+ isOpinie := false
+ tags := make([]string, 0)
+ ell, err := header.Search(`.//*[@data-test-id="article-label"]`)
+ p(err)
+ if len(ell) == 0 {
+ _ = w(fmt.Errorf("no labels: %s", url))
+ }
+ for _, el := range ell {
+ s := strings.TrimSpace(el.Content())
+ if s != "" && s != "Nieuws" {
+ tags = append(tags, s)
+ }
+ if strings.ToLower(s) == "opinie" {
+ isOpinie = true
+ }
+ }
+
+ pars := make([]string, 0)
+
+ found := false
+ ell, err = header.Search(`.//*[@data-test-id="header-intro"]`)
+ p(err)
+ for _, el := range ell {
+ s := strings.TrimSpace(el.Content())
+ if s != "" {
+ pars = append(pars, s)
+ found = true
+ }
+ }
+ if !found && !isOpinie {
+ _ = w(fmt.Errorf("no intro: %s", url))
+ }
+
+ specials, err := article.Search(`.//section//aside | .//section//figure | .//section//b`)
+ p(err)
+ for _, special := range specials {
+ special.Remove()
+ }
+
+ ell, err = article.Search(`.//section//*[@data-article-element-index]`)
+ p(err)
+ if len(ell) == 0 {
+ _ = w(fmt.Errorf("no elements: %s", url))
+
+ fp, err := os.Create(filename + ".err")
+ p(err)
+ p(fmt.Fprintf(fp, "no elements: %s\n", url))
+ p(fp.Close())
+ p(os.Chtimes(filename+".err", timestamp, timestamp))
+
+ fp, err = os.Create(filename + ".html")
+ p(err)
+ p(fp.Write(body))
+ p(fp.Close())
+ p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+ return false
+ }
+
+ found = false
+ for _, el := range ell {
+ s := strings.TrimSpace(el.Content())
+ if s != "" {
+ pars = append(pars, s)
+ found = true
+ }
+ }
+ if !found {
+ _ = w(fmt.Errorf("no text, skipping: %s", url))
+ fp, err := os.Create(filename + ".skip")
+ p(fp.WriteString(url + "\n"))
+ p(err)
+ p(os.Chtimes(filename+".skip", timestamp, timestamp))
+
+ fp, err = os.Create(filename + ".html")
+ p(err)
+ p(fp.Write(body))
+ p(fp.Close())
+ p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+ return true
+ }
+
+ fp, err := os.Create(filename + ".txt")
+ p(err)
+
+ if len(tags) == 0 {
+ p(fmt.Fprintln(fp, "##META text tag ="))
+ } else {
+ for _, tag := range tags {
+ p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
+ }
+ }
+
+ p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
+
+ for _, par := range pars {
+ p(fp.WriteString(u.AddEnd(u.FixSpace(par))))
+ }
+
+ p(fp.Close())
+
+ p(os.Chtimes(filename+".txt", timestamp, timestamp))
+
+ return true
+}
diff --git a/Volkskrant/txt2corpus.sh b/Volkskrant/txt2corpus.sh
new file mode 100755
index 0000000..eddb31d
--- /dev/null
+++ b/Volkskrant/txt2corpus.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+# Turn one day of collected Volkskrant .txt files into a parsed corpus:
+# tokenize, parse with Alpino, add metadata, pack the result.
+# Usage: txt2corpus.sh [YYYY-MM-DD]   (default: the day before yesterday)
+
+# Abort on the first failing command.
+set -e
+
+BASE=/net/corpora/nlnieuws
+PART=$BASE/Volkskrant
+
+unset CDPATH
+# $PART comes first so that the bare "metadata" command below resolves there.
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
+export TZ=Europe/Amsterdam
+. /net/aps/etc/alpino-activate.sh > /dev/null
+
+# Pick the date to process; an argument must look like YYYY-MM-DD.
+if [ "$1" = "" ]
+then
+ ds=`date -d -2days +%Y-%m-%d`
+else
+ case "$1" in
+ 2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
+ ds=$1
+ ;;
+ *)
+ echo INVALID
+ exit 1
+ ;;
+ esac
+fi
+
+# dp: the date as a directory path (YYYY/MM/DD); corpus: prefix of all output files.
+dp=${ds//-//}
+year=${ds%%-*}
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year
+
+cd $PART/$dp
+
+# Take the per-day lock: a symlink named "lock" pointing at lock.<pid>.
+# The readlink test confirms that the link is the one made by this process.
+# NOTE(review): because of "set -e" a failing ln already ends the script, and
+# any later failure leaves the lock in place.
+ln -s lock.$$ lock
+if [ "`readlink lock`" != lock.$$ ]
+then
+ echo Getting lock failed
+ exit 1
+fi
+
+rm -fr out
+mkdir out
+
+# Build $corpus.lines: for every article strip leading whitespace and ##META
+# lines, tokenize, and prefix each line with the label vk.<basename>.<n>|
+rm -f $corpus.lines
+for i in *.txt
+do
+ b=`basename $i .txt`
+ perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \
+ | perl -e '$n = 0; while(<>) { $n++; print("vk.'$b'.$n|$_"); }' \
+ >> $corpus.lines
+done
+
+# Parse all lines with Alpino, run from out/ with an empty xml/ directory.
+cd out
+mkdir xml
+Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
+
+# Insert the article metadata into the files under xml/.
+metadata
+
+# Pack the XML files into one corpus file.
+cd xml
+rm -f $corpus.data.dz $corpus.index
+alto -q -o $corpus.data.dz *.xml
+
+# count per article, not per sentence
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+
+# Clean up the work directory and release the lock.
+cd ../..
+rm -fr out
+
+rm -f lock
diff --git a/collect.sh b/collect.sh
index 43b5ce8..056ba1b 100755
--- a/collect.sh
+++ b/collect.sh
@@ -43,7 +43,7 @@ cd /net/corpora/nlnieuws/data/$year
declare -A parts
#parts[alles]='.'
-parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso'
+parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso|Volkskrant'
parts[amsterdam]='AT5|BuurtAdam|Parool'
parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom'
parts[literatuur]='LitNL|Tzum'
@@ -64,6 +64,7 @@ parts[vlaanderen]='HLN|VRT'
#parts[Sargasso]='Sargasso'
#parts[Sikkom]='Sikkom'
#parts[Tzum]='Tzum'
+#parts[Volkskrant]='Volkskrant'
#parts[VRT]='VRT'
for part in ${!parts[@]}
diff --git a/internal/util/util.go b/internal/util/util.go
index ee5a896..a1a12e1 100644
--- a/internal/util/util.go
+++ b/internal/util/util.go
@@ -11,19 +11,25 @@ import (
)
var (
- p = e.PanicErr
- reEOL = regexp.MustCompile(`[.!?]['"”’]?$`)
- reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}+\.?`)
- reLET = regexp.MustCompile(`\p{Lu}`)
- reBody = regexp.MustCompile(`<[bB][rR][ /]*>`)
+ p = e.PanicErr
+ reEOL = regexp.MustCompile(`[.!?]['"”’]?$`)
+ reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}+\.?`)
+ reLET = regexp.MustCompile(`\p{Lu}`)
+ reBody = regexp.MustCompile(`<[bB][rR][ /]*>`)
+ reQuotLeft = regexp.MustCompile(`|`)
+ reQuotRight = regexp.MustCompile(`|`)
)
-func HtmlFix(body []byte) []byte {
- return reBody.ReplaceAllLiteral(body, []byte(" "))
+func HtmlFix(html []byte) []byte {
+ html = reQuotLeft.ReplaceAllLiteral(html, []byte(" „"))
+ html = reQuotRight.ReplaceAllLiteral(html, []byte("” "))
+ return reBody.ReplaceAllLiteral(html, []byte(" "))
}
-func HtmlFixString(body string) string {
- return reBody.ReplaceAllLiteralString(body, " ")
+func HtmlFixString(html string) string {
+ html = reQuotLeft.ReplaceAllLiteralString(html, " „")
+ html = reQuotRight.ReplaceAllLiteralString(html, "” ")
+ return reBody.ReplaceAllLiteralString(html, " ")
}
func AddEnd(s string) string {
diff --git a/www/app.html b/www/app.html
index 21b8457..fa63a94 100644
--- a/www/app.html
+++ b/www/app.html
@@ -110,11 +110,16 @@
|
Reporters Online |
-
+
|
|
Sargasso |
+
+ |
+ |
+ de Volkskrant |
+
| Amsterdam |
|
@@ -122,16 +127,20 @@
|
- |
+ |
- In de buurt Amsterdam
+ Het Parool | Amsterdam
|
|
- |
+ |
- Parool Amsterdam
+ In de buurt | Amsterdam
|
@@ -147,7 +156,9 @@
|
|
- In de buurt Groningen
+ In de buurt | Groningen
|