fix voor zinnen aan elkaar

This commit is contained in:
Peter Kleiweg
2026-05-29 17:22:10 +02:00
parent ca4e7af8fa
commit 14590570ba
18 changed files with 54 additions and 32 deletions

View File

@@ -58,7 +58,7 @@ func main() {
var item Item
x(xml.Unmarshal(b, &item), filename)
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body/p | //body/h2`)

View File

@@ -159,6 +159,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)

View File

@@ -158,6 +158,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)

View File

@@ -155,6 +155,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)

View File

@@ -168,6 +168,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
/*
s := string(body)
ok = true

View File

@@ -63,7 +63,7 @@ func main() {
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
}
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body//p`)

View File

@@ -69,7 +69,9 @@ func main() {
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
}
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
x(fp.WriteString(u.FixSpace(item.Text)))
for _, line := range strings.SplitAfter(item.Text, "\n") {
x(fp.WriteString(u.AddEnd(u.FixSpace(line, true))))
}
x(fp.Close())
}
}

View File

@@ -161,6 +161,8 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
s := string(body)
ok := true
i1 := strings.Index(s, `<script type="application/ld+json"`)
@@ -226,22 +228,9 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
// text bevat kopjes zonder punt aan het eind
lines := strings.Split(text, "\n")
for i, line := range lines {
line = u.FixSpace(line)
n := len(line)
if n > 0 {
if strings.ContainsAny(line[n-1:], ".!?") {
continue
lines[i] = u.AddEnd(u.FixSpace(line, true))
}
}
if n > 1 {
s := line[n-2:]
if s == `."` || s == `!"` || s == `?"` {
continue
}
}
lines[i] = line + "."
}
text = strings.Join(lines, "\n") + "\n"
text = strings.Join(lines, "") + "\n"
fp, err := os.Create(filename + ".txt")
p(err)

View File

@@ -154,6 +154,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)

View File

@@ -65,7 +65,7 @@ func main() {
}
}
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body/p`)

View File

@@ -164,6 +164,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body)
p(err)

View File

@@ -64,7 +64,7 @@ func main() {
var item Item
x(xml.Unmarshal(b, &item))
x(buf.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
divs, err := root.Search(`//div[@class="donatieformlinks"]`)

View File

@@ -226,7 +226,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
// text bevat kopjes zonder punt aan het eind
lines := strings.Split(doc.Text, "\n")
for i, line := range lines {
lines[i] = u.AddEnd(u.FixSpace(line))
lines[i] = u.AddEnd(u.FixSpace(line, true))
}
text := strings.Join(lines, "") + "\n"
@@ -240,9 +240,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
if strings.HasPrefix(t, "br_") {
continue
}
if strings.HasPrefix(t, "tr_") {
t = t[3:]
}
t = strings.TrimPrefix(t, "tr_")
p(fmt.Fprintf(fp, "##META text tag = %s\n", t))
}
}

View File

@@ -62,7 +62,7 @@ func main() {
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
}
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body//p`)

View File

@@ -152,6 +152,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
s := string(body)
ok := true

View File

@@ -66,7 +66,7 @@ func main() {
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
}
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err)
root := doc.Root()
pp, err := root.Search(`//body/p`)

View File

@@ -179,6 +179,8 @@ func doArticle(filename string, url string, title string, tags []string, cats []
p(err)
p(resp.Body.Close())
body = u.HtmlFix(body)
/*
s := string(body)
ok := true

View File

@@ -13,10 +13,19 @@ import (
var (
// p is a package-local shorthand for e.PanicErr.
p = e.PanicErr
// reEOL matches one of . ! ? at the very end of a string, optionally
// followed by a single closing quote character (' " ” ’).
reEOL = regexp.MustCompile(`[.!?]['"”’]?$`)
// reNEOL matches one of . ! ? (optionally followed by a quote character)
// that is immediately followed by an uppercase letter and a lowercase letter.
reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}`)
// reNEOL matches one of . ! ? (optionally followed by a quote character)
// that is immediately followed by an uppercase letter, one or more lowercase
// letters, and an optional trailing period.
reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}+\.?`)
// reLET matches a single uppercase letter.
reLET = regexp.MustCompile(`\p{Lu}`)
// reBody matches a <br> tag in any letter case, with any number of spaces
// or slashes before the closing bracket (<br>, <BR/>, <br />, ...).
reBody = regexp.MustCompile(`<[bB][rR][ /]*>`)
)
// HtmlFix returns a copy of body in which every match of reBody (a <br> tag
// in any letter case, with optional spaces or slashes before the closing
// bracket) is replaced by a single space.
func HtmlFix(body []byte) []byte {
	space := []byte(" ")
	return reBody.ReplaceAllLiteral(body, space)
}
// HtmlFixString is the string counterpart of HtmlFix: it returns body with
// every match of reBody (a <br> tag in any letter case, with optional spaces
// or slashes before the closing bracket) replaced by a single space.
func HtmlFixString(body string) string {
	const space = " "
	return reBody.ReplaceAllLiteralString(body, space)
}
func AddEnd(s string) string {
s = strings.TrimSpace(s)
if s == "" {
@@ -28,12 +37,20 @@ func AddEnd(s string) string {
return s + ".\n"
}
func FixSpace(s string) string {
// FixSpace collapses every run of whitespace in s into a single space and
// drops leading and trailing whitespace.
//
// If the optional first element of opt is true, it additionally inserts a
// space wherever reNEOL matches, i.e. where sentence-ending punctuation
// (optionally followed by a quote character) is glued directly to a
// capitalised word. Matches that themselves end in a period, such as the
// ".Chr." in "v.Chr.", are left untouched.
func FixSpace(s string, opt ...bool) string {
	s = strings.Join(strings.Fields(s), " ")
	if len(opt) == 0 || !opt[0] {
		return s
	}
	return reNEOL.ReplaceAllStringFunc(s, func(match string) string {
		// For example "v.Chr.": a trailing period marks an abbreviation,
		// not a sentence boundary.
		if strings.HasSuffix(match, ".") {
			return match
		}
		// Every reNEOL match contains an uppercase letter, so reLET always
		// finds one here; split the match right before it.
		upper := reLET.FindStringIndex(match)[0]
		return match[:upper] + " " + match[upper:]
	})
}