diff --git a/AT5/Makefile b/AT5/Makefile index 63bab1b..02a7d6a 100644 --- a/AT5/Makefile +++ b/AT5/Makefile @@ -3,8 +3,8 @@ all: \ metadata \ at5 -xml2txt: cmd/xml2txt/*.go - go build -o $@ $^ +xml2txt: cmd/xml2txt/*.go ../internal/util/*.go + go build -o $@ $< metadata: cmd/metadata/*.go go build -o $@ $^ diff --git a/AT5/cmd/xml2txt/xml2txt.go b/AT5/cmd/xml2txt/xml2txt.go index 1d32880..f249581 100644 --- a/AT5/cmd/xml2txt/xml2txt.go +++ b/AT5/cmd/xml2txt/xml2txt.go @@ -4,6 +4,8 @@ import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + "encoding/xml" "fmt" "os" @@ -55,39 +57,15 @@ func main() { x(err) var item Item x(xml.Unmarshal(b, &item), filename) - x(fp.WriteString(addEnd(fixSpace(item.Title)))) + x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) root := doc.Root() pp, err := root.Search(`//body/p | //body/h2`) x(err) for _, p := range pp { - x(fp.WriteString(addEnd(fixSpace(p.Content())))) + x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content())))) } x(fp.Close()) } } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} diff --git a/BuurtAdam/Makefile b/BuurtAdam/Makefile index e00dcbf..6266d5a 100644 --- a/BuurtAdam/Makefile +++ b/BuurtAdam/Makefile @@ -5,5 +5,5 @@ all: \ metadata: cmd/metadata/*.go go build -o $@ $^ -buurtadam: cmd/buurtadam/*.go - go build -o $@ $^ +buurtadam: cmd/buurtadam/*.go ../internal/util/*.go + go build -o $@ $< diff --git a/BuurtAdam/cmd/buurtadam/buurtadam.go b/BuurtAdam/cmd/buurtadam/buurtadam.go index 7b5f348..77ce869 
100644 --- a/BuurtAdam/cmd/buurtadam/buurtadam.go +++ b/BuurtAdam/cmd/buurtadam/buurtadam.go @@ -4,13 +4,14 @@ import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + "encoding/xml" "fmt" "io" "net/http" "net/url" "os" - "path/filepath" "strings" "time" ) @@ -64,7 +65,7 @@ func main() { }() myLock := "/net/corpora/nlnieuws/BuurtAdam/lock" - mkLock(myLock) + u.MkLock(myLock) defer func() { _ = os.Remove(myLock) }() @@ -202,7 +203,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n } for _, div := range divs { - p(fp.WriteString(addEnd(fixSpace(div.Content())))) + p(fp.WriteString(u.AddEnd(u.FixSpace(div.Content())))) } p(fp.Close()) @@ -211,40 +212,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n return true } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} - -func mkLock(filename string) { - pid := os.Getpid() - link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid) - p(os.Symlink(link, filename)) - - name, err := os.Readlink(filename) - p(err) - - if name != link { - p(fmt.Errorf("wrong lock name %q, should be %q", name, link)) - } -} diff --git a/BuurtGrn/Makefile b/BuurtGrn/Makefile index edb80e2..457e616 100644 --- a/BuurtGrn/Makefile +++ b/BuurtGrn/Makefile @@ -5,5 +5,5 @@ all: \ metadata: cmd/metadata/*.go go build -o $@ $^ -buurtgrn: cmd/buurtgrn/*.go - go build -o $@ $^ +buurtgrn: cmd/buurtgrn/*.go ../internal/util/*.go + go build -o $@ $< diff --git a/BuurtGrn/cmd/buurtgrn/buurtgrn.go b/BuurtGrn/cmd/buurtgrn/buurtgrn.go index 
852d25f..58b3062 100644 --- a/BuurtGrn/cmd/buurtgrn/buurtgrn.go +++ b/BuurtGrn/cmd/buurtgrn/buurtgrn.go @@ -6,11 +6,11 @@ import ( "encoding/xml" "fmt" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" "io" "net/http" "net/url" "os" - "path/filepath" "strings" "time" ) @@ -64,7 +64,7 @@ func main() { }() myLock := "/net/corpora/nlnieuws/BuurtGrn/lock" - mkLock(myLock) + u.MkLock(myLock) defer func() { _ = os.Remove(myLock) }() @@ -202,7 +202,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n } for _, div := range divs { - p(fp.WriteString(addEnd(fixSpace(div.Content())))) + p(fp.WriteString(u.AddEnd(u.FixSpace(div.Content())))) } p(fp.Close()) @@ -211,40 +211,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n return true } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} - -func mkLock(filename string) { - pid := os.Getpid() - link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid) - p(os.Symlink(link, filename)) - - name, err := os.Readlink(filename) - p(err) - - if name != link { - p(fmt.Errorf("wrong lock name %q, should be %q", name, link)) - } -} diff --git a/GG/Makefile b/GG/Makefile index 2bf1e11..a0cbe08 100644 --- a/GG/Makefile +++ b/GG/Makefile @@ -5,5 +5,5 @@ all: \ metadata: cmd/metadata/*.go go build -o $@ $^ -gg: cmd/gg/*.go - go build -o $@ $^ +gg: cmd/gg/*.go ../internal/util/*.go + go build -o $@ $< diff --git a/GG/cmd/gg/gg.go b/GG/cmd/gg/gg.go index 5ca8bd1..72a3b4b 100644 --- a/GG/cmd/gg/gg.go +++ b/GG/cmd/gg/gg.go @@ -4,13 +4,14 @@ import ( e "codeberg.org/pebbe/errors" 
"github.com/jbowtie/gokogiri" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + "encoding/xml" "fmt" "io" "net/http" "net/url" "os" - "path/filepath" "strings" "time" ) @@ -64,7 +65,7 @@ func main() { }() myLock := "/net/corpora/nlnieuws/GG/lock" - mkLock(myLock) + u.MkLock(myLock) defer func() { _ = os.Remove(myLock) }() @@ -191,10 +192,10 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n fp, err := os.Create(filename + ".txt") p(err) - p(fp.WriteString(addEnd(fixSpace(title)))) + p(fp.WriteString(u.AddEnd(u.FixSpace(title)))) for _, el := range ell { - p(fp.WriteString(addEnd(fixSpace(el.Content())))) + p(fp.WriteString(u.AddEnd(u.FixSpace(el.Content())))) } p(fp.Close()) @@ -203,40 +204,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n return true } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} - -func mkLock(filename string) { - pid := os.Getpid() - link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid) - p(os.Symlink(link, filename)) - - name, err := os.Readlink(filename) - p(err) - - if name != link { - p(fmt.Errorf("wrong lock name %q, should be %q", name, link)) - } -} diff --git a/HLN/Makefile b/HLN/Makefile index 911b43f..daa3a67 100644 --- a/HLN/Makefile +++ b/HLN/Makefile @@ -5,5 +5,5 @@ all: \ metadata: cmd/metadata/*.go go build -o $@ $^ -hln: cmd/hln/*.go - go build -o $@ $^ +hln: cmd/hln/*.go ../internal/util/*.go + go build -o $@ $< diff --git a/HLN/cmd/hln/hln.go b/HLN/cmd/hln/hln.go index 7cf4317..bdfefce 100644 --- a/HLN/cmd/hln/hln.go +++ b/HLN/cmd/hln/hln.go @@ -4,13 +4,16 @@ import 
( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + + // "encoding/json" "encoding/xml" "fmt" + // "html" "io" "net/http" "net/url" "os" - "path/filepath" "strings" "time" ) @@ -25,9 +28,16 @@ type ItemT struct { UnixTime int64 `xml:"unixTime"` Guid string `xml:"guid"` Link string `xml:"link"` + Title string `xml:"title"` Data []byte `xml:",innerxml"` } +/* +type GraphT struct { + Graph []map[string]any `json:"@graph"` +} +*/ + var ( p = e.PanicErr w = e.WarnErr @@ -62,7 +72,7 @@ func main() { }() myLock := "/net/corpora/nlnieuws/HLN/lock" - mkLock(myLock) + u.MkLock(myLock) defer func() { _ = os.Remove(myLock) }() @@ -127,18 +137,19 @@ func main() { p(fp.WriteString("\n")) p(fp.Close()) p(os.Chtimes(filename+".xml", t, t)) - ok = doArticle(filename, item.Link, t, needUpdate) + ok = doArticle(filename, item.Link, item.Title, t, needUpdate) }() } } -func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) (ok bool) { +func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) { if exists(filename + ".skip") { return true } if needUpdate { _ = os.Remove(filename + ".err") _ = os.Remove(filename + ".html") + // _ = os.Remove(filename + ".json") _ = os.Remove(filename + ".txt") } else { if exists(filename + ".txt") { @@ -158,6 +169,60 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool p(err) p(resp.Body.Close()) + /* + s := string(body) + ok = true + i1 := strings.Index(s, `type="application/ld+json"`) + if i1 < 0 { + ok = false + } else { + i1 += strings.Index(s[i1:], `>`) + 1 + i2 := i1 + strings.Index(s[i1:], ``) + if i2 < i1 { + ok = false + } else { + s = html.UnescapeString(s[i1:i2]) + } + } + if !ok { + _ = w(fmt.Errorf("script jsonld not found: %s", url)) + + fp, err := os.Create(filename + ".err") + p(err) + p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url)) + p(fp.Close()) + 
p(os.Chtimes(filename+".err", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false + } + + var graph GraphT + p(json.Unmarshal([]byte(s), &graph)) + for _, g := range graph.Graph { + t := g["@type"] + switch v := t.(type) { + case string: + if v == "NewsArticle" { + b, err := json.Marshal(g) + p(err) + s = string(b) + } + } + } + + fp, err := os.Create(filename + ".json") + p(err) + p(fp.WriteString(s)) + p(fp.Close()) + p(os.Chtimes(filename+".json", timestamp, timestamp)) + */ + doc, err := gokogiri.ParseHtml(body) p(err) @@ -196,18 +261,6 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool pars := make([]string, 0) - ell, err = article.Search(`.//*[@data-content-type="TITLE"]`) - p(err) - if len(ell) != 1 { - _ = w(fmt.Errorf("found %d titles: %s", len(ell), url)) - } - for _, el := range ell { - s := strings.TrimSpace(el.Content()) - if s != "" { - pars = append(pars, s) - } - } - hasIntro := false ell, err = article.Search(`.//*[@data-content-type="INTRO"]`) p(err) @@ -285,12 +338,14 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool p(fmt.Fprintln(fp, "##META text tag =")) } else { for _, tag := range tags { - p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag))) + p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag))) } } + p(fp.WriteString(u.AddEnd(u.FixSpace(title)))) + for _, par := range pars { - p(fp.WriteString(addEnd(fixSpace(par)))) + p(fp.WriteString(u.AddEnd(u.FixSpace(par)))) } p(fp.Close()) @@ -299,43 +354,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool return true } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == 
`!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - if strings.HasSuffix(s, `.”`) || strings.HasSuffix(s, `!”`) || strings.HasSuffix(s, `?”`) { - return s + "\n" - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} - -func mkLock(filename string) { - pid := os.Getpid() - link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid) - p(os.Symlink(link, filename)) - - name, err := os.Readlink(filename) - p(err) - - if name != link { - p(fmt.Errorf("wrong lock name %q, should be %q", name, link)) - } -} diff --git a/LitNL/Makefile b/LitNL/Makefile index 9a6b040..e2391e3 100644 --- a/LitNL/Makefile +++ b/LitNL/Makefile @@ -3,11 +3,11 @@ all: \ metadata \ litnl -xml2txt: cmd/xml2txt/*.go - go build -o $@ $^ +xml2txt: cmd/xml2txt/*.go ../internal/util/*.go + go build -o $@ $< metadata: cmd/metadata/*.go go build -o $@ $^ -litnl: cmd/litnl/*.go - go build -o $@ $^ +litnl: cmd/litnl/*.go ../internal/util/*.go + go build -o $@ $< diff --git a/LitNL/cmd/litnl/litnl.go b/LitNL/cmd/litnl/litnl.go index 56aa5b2..afa1a0e 100644 --- a/LitNL/cmd/litnl/litnl.go +++ b/LitNL/cmd/litnl/litnl.go @@ -3,13 +3,14 @@ package main import ( e "codeberg.org/pebbe/errors" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + "encoding/xml" "fmt" "io" "net/http" "net/url" "os" - "path/filepath" "strings" "time" ) @@ -46,7 +47,7 @@ func main() { }() myLock := "/net/corpora/nlnieuws/LitNL/lock" - mkLock(myLock) + u.MkLock(myLock) defer func() { _ = os.Remove(myLock) }() @@ -108,16 +109,3 @@ func main() { } } - -func mkLock(filename string) { - pid := os.Getpid() - link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid) - p(os.Symlink(link, filename)) - - name, err := os.Readlink(filename) - p(err) - - if name != link { - p(fmt.Errorf("wrong lock name %q, should be %q", name, link)) - } -} diff --git a/LitNL/cmd/xml2txt/xml2txt.go b/LitNL/cmd/xml2txt/xml2txt.go index 123c31c..777071a 100644 --- 
a/LitNL/cmd/xml2txt/xml2txt.go +++ b/LitNL/cmd/xml2txt/xml2txt.go @@ -4,6 +4,8 @@ import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + "encoding/xml" "fmt" "os" @@ -58,9 +60,9 @@ func main() { var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { - x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat))) + x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat))) } - x(fp.WriteString(addEnd(fixSpace(item.Title)))) + x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) root := doc.Root() @@ -74,32 +76,8 @@ func main() { _ = w(fmt.Errorf("empty: %s", filename)) } for _, p := range pp { - x(fp.WriteString(addEnd(fixSpace(p.Content())))) + x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content())))) } x(fp.Close()) } } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} diff --git a/NOS/Makefile b/NOS/Makefile index d66f8cf..f30d292 100644 --- a/NOS/Makefile +++ b/NOS/Makefile @@ -3,8 +3,8 @@ all: \ metadata \ nos -json2txt: cmd/json2txt/*.go - go build -o $@ $^ +json2txt: cmd/json2txt/*.go ../internal/util/*.go + go build -o $@ $< metadata: cmd/metadata/*.go go build -o $@ $^ diff --git a/NOS/cmd/json2txt/json2txt.go b/NOS/cmd/json2txt/json2txt.go index 72cc887..8c9e0b1 100644 --- a/NOS/cmd/json2txt/json2txt.go +++ b/NOS/cmd/json2txt/json2txt.go @@ -3,6 +3,8 @@ package main import ( e "codeberg.org/pebbe/errors" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + "encoding/json" "fmt" "os" @@ -61,13 +63,13 @@ func main() { x(err) item := 
getItem(b, filename) for _, cat := range item.Cats { - x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) + x(fmt.Fprintf(fp, "##META text cat = %s\n", u.FixSpace(cat))) } for _, tag := range item.Tags { - x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag))) + x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag))) } - x(fp.WriteString(addEnd(fixSpace(item.Title)))) - x(fp.WriteString(fixSpace(item.Text))) + x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) + x(fp.WriteString(u.FixSpace(item.Text))) x(fp.Close()) } } @@ -90,27 +92,3 @@ func getItem(b []byte, filename string) Item { x(json.Unmarshal(b, &item), filename) return item } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} diff --git a/NOS/cmd/nos/nos.go b/NOS/cmd/nos/nos.go index f435f18..72291e4 100644 --- a/NOS/cmd/nos/nos.go +++ b/NOS/cmd/nos/nos.go @@ -1,12 +1,11 @@ package main import ( - "html" - e "codeberg.org/pebbe/errors" "encoding/xml" "fmt" + "html" "io" "net/http" "net/url" diff --git a/NieuwsNL/Makefile b/NieuwsNL/Makefile index 3e286e0..a61d678 100644 --- a/NieuwsNL/Makefile +++ b/NieuwsNL/Makefile @@ -5,5 +5,5 @@ all: \ metadata: cmd/metadata/*.go go build -o $@ $^ -nieuwsnl: cmd/nieuwsnl/*.go - go build -o $@ $^ +nieuwsnl: cmd/nieuwsnl/*.go ../internal/util/*.go + go build -o $@ $< diff --git a/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go b/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go index 9e70076..947fa1d 100644 --- a/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go +++ b/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go @@ -4,6 +4,8 @@ import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" + u 
"git.web.rug.nl/p209327/nlnieuws/internal/util" + "bytes" "encoding/xml" "fmt" @@ -11,7 +13,6 @@ import ( "net/http" "net/url" "os" - "path/filepath" "strings" "time" ) @@ -64,7 +65,7 @@ func main() { }() myLock := "/net/corpora/nlnieuws/NieuwsNL/lock" - mkLock(myLock) + u.MkLock(myLock) defer func() { _ = os.Remove(myLock) }() @@ -173,11 +174,11 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n } else { for _, a := range aa { tag = strings.ReplaceAll(a.Content(), "\n", " ") - p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag))) + p(fmt.Fprintf(&buf, "##META text tag = %s\n", u.FixSpace(tag))) } } - p(buf.WriteString(addEnd(fixSpace(title)))) + p(buf.WriteString(u.AddEnd(u.FixSpace(title)))) // oud: //div[@id="article-blocks"]//p pp, err := root.Search(`//div[@id="article-blocks"]//div[contains(@class, "paragraph-content")]`) @@ -204,7 +205,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n return false // echt fout } for _, p1 := range pp { - p(buf.WriteString(addEnd(fixSpace(p1.Content())))) + p(buf.WriteString(u.AddEnd(u.FixSpace(p1.Content())))) } fp, err := os.Create(filename + ".txt") @@ -216,40 +217,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n return true } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} - -func mkLock(filename string) { - pid := os.Getpid() - link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid) - p(os.Symlink(link, filename)) - - name, err := os.Readlink(filename) - p(err) - - if name != link { - p(fmt.Errorf("wrong lock name %q, should 
be %q", name, link)) - } -} diff --git a/Oog/Makefile b/Oog/Makefile index 1b26dc9..66f58fe 100644 --- a/Oog/Makefile +++ b/Oog/Makefile @@ -3,8 +3,8 @@ all: \ metadata \ oog -xml2txt: cmd/xml2txt/*.go - go build -o $@ $^ +xml2txt: cmd/xml2txt/*.go ../internal/util/*.go + go build -o $@ $< metadata: cmd/metadata/*.go go build -o $@ $^ diff --git a/Oog/cmd/xml2txt/xml2txt.go b/Oog/cmd/xml2txt/xml2txt.go index 4be9dc4..8a8f171 100644 --- a/Oog/cmd/xml2txt/xml2txt.go +++ b/Oog/cmd/xml2txt/xml2txt.go @@ -4,6 +4,8 @@ import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + "encoding/xml" "fmt" "os" @@ -57,41 +59,17 @@ func main() { var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { - x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat))) + x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat))) } - x(fp.WriteString(addEnd(fixSpace(item.Title)))) + x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) root := doc.Root() pp, err := root.Search(`//body/p`) x(err) for _, p := range pp { - x(fp.WriteString(addEnd(fixSpace(p.Content())))) + x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content())))) } x(fp.Close()) } } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} diff --git a/Parool/Makefile b/Parool/Makefile index 3575e76..d0b1777 100644 --- a/Parool/Makefile +++ b/Parool/Makefile @@ -5,5 +5,5 @@ all: \ metadata: cmd/metadata/*.go go build -o $@ $^ -parool: cmd/parool/*.go - go build -o $@ $^ +parool: cmd/parool/*.go 
../internal/util/*.go + go build -o $@ $< diff --git a/Parool/cmd/parool/parool.go b/Parool/cmd/parool/parool.go index 37105d0..18aeb31 100644 --- a/Parool/cmd/parool/parool.go +++ b/Parool/cmd/parool/parool.go @@ -4,13 +4,16 @@ import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + + //"encoding/json" "encoding/xml" "fmt" + //"html" "io" "net/http" "net/url" "os" - "path/filepath" "strings" "time" ) @@ -25,9 +28,16 @@ type ItemT struct { UnixTime int64 `xml:"unixTime"` Guid string `xml:"guid"` Link string `xml:"link"` + Title string `xml:"title"` Data []byte `xml:",innerxml"` } +/* +type GraphT struct { + Graph []map[string]any `json:"@graph"` +} +*/ + var ( p = e.PanicErr w = e.WarnErr @@ -62,7 +72,7 @@ func main() { }() myLock := "/net/corpora/nlnieuws/Parool/lock" - mkLock(myLock) + u.MkLock(myLock) defer func() { _ = os.Remove(myLock) }() @@ -122,18 +132,19 @@ func main() { p(fp.WriteString("\n")) p(fp.Close()) p(os.Chtimes(filename+".xml", t, t)) - ok = doArticle(filename, item.Link, t, needUpdate) + ok = doArticle(filename, item.Link, item.Title, t, needUpdate) }() } } -func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) (ok bool) { +func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) { if exists(filename + ".skip") { return true } if needUpdate { _ = os.Remove(filename + ".err") _ = os.Remove(filename + ".html") + // _ = os.Remove(filename + ".json") _ = os.Remove(filename + ".txt") } else { if exists(filename + ".txt") { @@ -156,6 +167,62 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool doc, err := gokogiri.ParseHtml(body) p(err) + /* + + s := string(body) + + ok = true + i1 := strings.Index(s, ``) + if i2 < i1 { + ok = false + } else { + s = html.UnescapeString(s[i1:i2]) + } + } + if !ok { + _ = w(fmt.Errorf("script jsonld not found: %s", url)) + + fp, err := 
os.Create(filename + ".err") + p(err) + p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false + } + + var graph GraphT + p(json.Unmarshal([]byte(s), &graph)) + for _, g := range graph.Graph { + t := g["@type"] + switch v := t.(type) { + case string: + if v == "NewsArticle" { + b, err := json.Marshal(g) + p(err) + s = string(b) + } + } + } + + fp, err := os.Create(filename + ".json") + p(err) + p(fp.WriteString(s)) + p(fp.Close()) + p(os.Chtimes(filename+".json", timestamp, timestamp)) + */ + root := doc.Root() articles, err := root.Search(`//article[@id="article-content"]`) @@ -226,18 +293,6 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool pars := make([]string, 0) - ell, err = header.Search(`.//*[@data-test-id="article-title"]`) - p(err) - if len(ell) != 1 { - _ = w(fmt.Errorf("found %d titles: %s", len(ell), url)) - } - for _, el := range ell { - s := strings.TrimSpace(el.Content()) - if s != "" { - pars = append(pars, s) - } - } - found := false ell, err = header.Search(`.//*[@data-test-id="header-intro"]`) p(err) @@ -309,12 +364,14 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool p(fmt.Fprintln(fp, "##META text tag =")) } else { for _, tag := range tags { - p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag))) + p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag))) } } + p(fp.WriteString(u.AddEnd(u.FixSpace(title)))) + for _, par := range pars { - p(fp.WriteString(addEnd(fixSpace(par)))) + p(fp.WriteString(u.AddEnd(u.FixSpace(par)))) } p(fp.Close()) @@ -323,43 +380,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool return true } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 
0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - if strings.HasSuffix(s, `.”`) || strings.HasSuffix(s, `!”`) || strings.HasSuffix(s, `?”`) { - return s + "\n" - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} - -func mkLock(filename string) { - pid := os.Getpid() - link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid) - p(os.Symlink(link, filename)) - - name, err := os.Readlink(filename) - p(err) - - if name != link { - p(fmt.Errorf("wrong lock name %q, should be %q", name, link)) - } -} diff --git a/RO/Makefile b/RO/Makefile index c3d8f3f..ee3079c 100644 --- a/RO/Makefile +++ b/RO/Makefile @@ -3,8 +3,8 @@ all: \ metadata \ ro -xml2txt: cmd/xml2txt/*.go - go build -o $@ $^ +xml2txt: cmd/xml2txt/*.go ../internal/util/*.go + go build -o $@ $< metadata: cmd/metadata/*.go go build -o $@ $^ diff --git a/RO/cmd/xml2txt/xml2txt.go b/RO/cmd/xml2txt/xml2txt.go index 8ce8a5d..9260b1e 100644 --- a/RO/cmd/xml2txt/xml2txt.go +++ b/RO/cmd/xml2txt/xml2txt.go @@ -5,6 +5,8 @@ import ( "github.com/jbowtie/gokogiri" "github.com/pebbe/textcat/v2" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + "bytes" "encoding/xml" "fmt" @@ -61,7 +63,7 @@ func main() { var buf bytes.Buffer var item Item x(xml.Unmarshal(b, &item)) - x(buf.WriteString(addEnd(fixSpace(item.Title)))) + x(buf.WriteString(u.AddEnd(u.FixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) root := doc.Root() @@ -73,7 +75,7 @@ func main() { pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`) x(err) for _, p := range pp { - x(buf.WriteString(addEnd(fixSpace(p.Content())))) + x(buf.WriteString(u.AddEnd(u.FixSpace(p.Content())))) } text := buf.String() @@ -90,33 +92,9 @@ func main() { fp, 
err := os.Create("out/" + filename[:len(filename)-4] + ".txt") x(err) for _, cat := range item.Cats { - x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat))) + x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat))) } x(fp.WriteString(text)) x(fp.Close()) } } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} diff --git a/RTVNoord/Makefile b/RTVNoord/Makefile index d736fd2..3f11722 100644 --- a/RTVNoord/Makefile +++ b/RTVNoord/Makefile @@ -5,5 +5,5 @@ all: \ metadata: cmd/metadata/*.go go build -o $@ $^ -rtvnoord: cmd/rtvnoord/*.go - go build -o $@ $^ +rtvnoord: cmd/rtvnoord/*.go ../internal/util/*.go + go build -o $@ $< diff --git a/RTVNoord/cmd/rtvnoord/rtvnoord.go b/RTVNoord/cmd/rtvnoord/rtvnoord.go index 3c571d2..f16a712 100644 --- a/RTVNoord/cmd/rtvnoord/rtvnoord.go +++ b/RTVNoord/cmd/rtvnoord/rtvnoord.go @@ -3,6 +3,8 @@ package main import ( e "codeberg.org/pebbe/errors" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + "encoding/json" "encoding/xml" "fmt" @@ -10,7 +12,6 @@ import ( "io" "net/http" "os" - "path/filepath" "strings" "time" ) @@ -75,7 +76,7 @@ func main() { }() myLock := "/net/corpora/nlnieuws/RTVNoord/lock" - mkLock(myLock) + u.MkLock(myLock) defer func() { _ = os.Remove(myLock) }() @@ -225,7 +226,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool // text bevat kopjes zonder punt aan het eind lines := strings.Split(doc.Text, "\n") for i, line := range lines { - lines[i] = addEnd(fixSpace(line)) + lines[i] = u.AddEnd(u.FixSpace(line)) } text := strings.Join(lines, "") + "\n" @@ -235,16 +236,16 @@ func 
doArticle(filename string, url string, timestamp time.Time, needUpdate bool p(fmt.Fprintln(fp, "##META text tag =")) } else { for _, tag := range doc.Tags { - p(fmt.Fprintf(fp, "##META text tag = %s\n", strings.ToLower(fixSpace(tag)))) + p(fmt.Fprintf(fp, "##META text tag = %s\n", strings.ToLower(u.FixSpace(tag)))) } } if doc.Cat == "" { p(fmt.Fprintln(fp, "##META text cat =")) } else { - p(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(doc.Cat))) + p(fmt.Fprintf(fp, "##META text cat = %s\n", u.FixSpace(doc.Cat))) } - p(fp.WriteString(addEnd(doc.Title))) + p(fp.WriteString(u.AddEnd(doc.Title))) p(fp.WriteString(text)) p(fp.Close()) @@ -252,40 +253,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool return true } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} - -func mkLock(filename string) { - pid := os.Getpid() - link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid) - p(os.Symlink(link, filename)) - - name, err := os.Readlink(filename) - p(err) - - if name != link { - p(fmt.Errorf("wrong lock name %q, should be %q", name, link)) - } -} diff --git a/Sargasso/Makefile b/Sargasso/Makefile index 39e8f87..be57db7 100644 --- a/Sargasso/Makefile +++ b/Sargasso/Makefile @@ -3,8 +3,8 @@ all: \ metadata \ sargasso -xml2txt: cmd/xml2txt/*.go - go build -o $@ $^ +xml2txt: cmd/xml2txt/*.go ../internal/util/*.go + go build -o $@ $< metadata: cmd/metadata/*.go go build -o $@ $^ diff --git a/Sargasso/cmd/xml2txt/xml2txt.go b/Sargasso/cmd/xml2txt/xml2txt.go index f04246a..34274c8 100644 --- a/Sargasso/cmd/xml2txt/xml2txt.go +++ 
b/Sargasso/cmd/xml2txt/xml2txt.go @@ -4,6 +4,8 @@ import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + "encoding/xml" "fmt" "os" @@ -57,41 +59,17 @@ func main() { var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { - x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat))) + x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat))) } - x(fp.WriteString(addEnd(fixSpace(item.Title)))) + x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) root := doc.Root() pp, err := root.Search(`//body//p`) x(err) for _, p := range pp { - x(fp.WriteString(addEnd(fixSpace(p.Content())))) + x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content())))) } x(fp.Close()) } } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} diff --git a/Sikkom/Makefile b/Sikkom/Makefile index 7ec39b7..daea35d 100644 --- a/Sikkom/Makefile +++ b/Sikkom/Makefile @@ -5,5 +5,5 @@ all: \ metadata: cmd/metadata/*.go go build -o $@ $^ -sikkom: cmd/sikkom/*.go - go build -o $@ $^ +sikkom: cmd/sikkom/*.go ../internal/util/*.go + go build -o $@ $< diff --git a/Sikkom/cmd/sikkom/sikkom.go b/Sikkom/cmd/sikkom/sikkom.go index ef831cd..599647a 100644 --- a/Sikkom/cmd/sikkom/sikkom.go +++ b/Sikkom/cmd/sikkom/sikkom.go @@ -4,6 +4,8 @@ import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + "encoding/xml" "fmt" "html" @@ -11,7 +13,6 @@ import ( "net/http" "net/url" "os" - "path/filepath" "strings" "time" ) @@ -64,7 +65,7 @@ 
func main() { }() myLock := "/net/corpora/nlnieuws/Sikkom/lock" - mkLock(myLock) + u.MkLock(myLock) defer func() { _ = os.Remove(myLock) }() @@ -220,49 +221,12 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n fp, err = os.Create(filename + ".txt") p(err) - p(fp.WriteString(addEnd(fixSpace(title)))) + p(fp.WriteString(u.AddEnd(u.FixSpace(title)))) for _, p1 := range pp { - p(fp.WriteString(addEnd(fixSpace(p1.Content())))) + p(fp.WriteString(u.AddEnd(u.FixSpace(p1.Content())))) } p(fp.Close()) return true } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} - -func mkLock(filename string) { - pid := os.Getpid() - link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid) - p(os.Symlink(link, filename)) - - name, err := os.Readlink(filename) - p(err) - - if name != link { - p(fmt.Errorf("wrong lock name %q, should be %q", name, link)) - } -} diff --git a/Tzum/Makefile b/Tzum/Makefile index 127b1a9..6df7f18 100644 --- a/Tzum/Makefile +++ b/Tzum/Makefile @@ -3,8 +3,8 @@ all: \ metadata \ tzum -xml2txt: cmd/xml2txt/*.go - go build -o $@ $^ +xml2txt: cmd/xml2txt/*.go ../internal/util/*.go + go build -o $@ $< metadata: cmd/metadata/*.go go build -o $@ $^ diff --git a/Tzum/cmd/xml2txt/xml2txt.go b/Tzum/cmd/xml2txt/xml2txt.go index 97c0e21..0a6d144 100644 --- a/Tzum/cmd/xml2txt/xml2txt.go +++ b/Tzum/cmd/xml2txt/xml2txt.go @@ -4,6 +4,8 @@ import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + "encoding/xml" "fmt" "os" @@ -57,9 +59,9 @@ func main() { var item Item x(xml.Unmarshal(b, &item)) for _, 
cat := range item.Cats { - x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat))) + x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat))) } - x(fp.WriteString(addEnd(fixSpace(item.Title)))) + x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) root := doc.Root() @@ -68,33 +70,9 @@ func main() { for _, p := range pp { s := p.Content() if !strings.Contains(s, "verscheen eerst op Tzum.") { - x(fp.WriteString(addEnd(fixSpace(p.Content())))) + x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content())))) } } x(fp.Close()) } } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} diff --git a/VRT/Makefile b/VRT/Makefile index 10bfb0f..472c384 100644 --- a/VRT/Makefile +++ b/VRT/Makefile @@ -5,5 +5,5 @@ all: \ metadata: cmd/metadata/*.go go build -o $@ $^ -vrt: cmd/vrt/*.go - go build -o $@ $^ +vrt: cmd/vrt/*.go ../internal/util/*.go + go build -o $@ $< diff --git a/VRT/cmd/vrt/vrt.go b/VRT/cmd/vrt/vrt.go index 84defda..258c36c 100644 --- a/VRT/cmd/vrt/vrt.go +++ b/VRT/cmd/vrt/vrt.go @@ -4,6 +4,8 @@ import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" + u "git.web.rug.nl/p209327/nlnieuws/internal/util" + "bytes" "encoding/xml" "fmt" @@ -11,7 +13,6 @@ import ( "net/http" "net/url" "os" - "path/filepath" "strings" "time" ) @@ -78,7 +79,7 @@ func main() { }() myLock := "/net/corpora/nlnieuws/VRT/lock" - mkLock(myLock) + u.MkLock(myLock) defer func() { _ = os.Remove(myLock) }() @@ -242,18 +243,18 @@ func doArticle(filename string, url string, title string, tags []string, cats [] p(fmt.Fprintln(&buf, "##META text 
cat =")) } else { for _, cat := range cats { - p(fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat))) + p(fmt.Fprintf(&buf, "##META text cat = %s\n", u.FixSpace(cat))) } } if len(tags) == 0 { p(fmt.Fprintln(&buf, "##META text tag =")) } else { for _, tag := range tags { - p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag))) + p(fmt.Fprintf(&buf, "##META text tag = %s\n", u.FixSpace(tag))) } } - _, err = buf.WriteString(addEnd(fixSpace(title))) + _, err = buf.WriteString(u.AddEnd(u.FixSpace(title))) p(err) fouten := make([]string, 0) @@ -262,7 +263,7 @@ func doArticle(filename string, url string, title string, tags []string, cats [] pp, err := root.Search(`//div[@data-sentry-component="ArticleHeading"]//*[contains(@class,"prose-article-body-r")]`) p(err) for _, p1 := range pp { - p(fmt.Fprint(&buf, addEnd(fixSpace(p1.Content())))) + p(fmt.Fprint(&buf, u.AddEnd(u.FixSpace(p1.Content())))) found = true } if !found { @@ -277,7 +278,7 @@ func doArticle(filename string, url string, title string, tags []string, cats [] `//div[@data-sentry-component="ArticleTitle"]//h2`) p(err) for _, p1 := range pp { - p(fmt.Fprint(&buf, addEnd(fixSpace(p1.Content())))) + p(fmt.Fprint(&buf, u.AddEnd(u.FixSpace(p1.Content())))) found = true } if !found { @@ -311,40 +312,3 @@ func doArticle(filename string, url string, title string, tags []string, cats [] return true } - -func addEnd(s string) string { - s = strings.TrimSpace(s) - n := len(s) - if n == 0 { - return "" - } - if n > 0 { - if strings.ContainsAny(s[n-1:], ".!?") { - return s + "\n" - } - } - if n > 1 { - s2 := s[n-2:] - if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { - return s + "\n" - } - } - return s + ".\n" -} - -func fixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") -} - -func mkLock(filename string) { - pid := os.Getpid() - link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid) - p(os.Symlink(link, filename)) - - name, err := 
// reEOL matches text that already ends a sentence: one of '.', '!' or '?',
// optionally followed by a single closing quote (straight or curly).
var reEOL = regexp.MustCompile(`[.!?]['"”’]?$`)

// AddEnd normalizes s into one output line of sentence text.
//
// Leading and trailing whitespace is trimmed first. The result is:
//   - "" when nothing is left after trimming, so empty paragraphs emit nothing;
//   - the trimmed text plus "\n" when it already ends in '.', '!' or '?',
//     optionally followed by one closing quote;
//   - the trimmed text plus ".\n" otherwise.
//
// Every non-empty result ends in a newline. Callers write the returned
// strings back to back, so leaving the newline off would glue a paragraph
// to the one that follows it.
func AddEnd(s string) string {
	s = strings.TrimSpace(s)
	switch {
	case s == "":
		return ""
	case reEOL.MatchString(s):
		// Already terminated: only the line break is missing.
		return s + "\n"
	default:
		return s + ".\n"
	}
}