Compare commits

...

2 Commits

Author SHA1 Message Date
Peter Kleiweg
efa301cc4a Tzum: <em>Titel</em> -> "Titel" 2026-06-03 17:32:01 +02:00
Peter Kleiweg
14590570ba fix voor zinnen aan elkaar 2026-05-29 17:22:10 +02:00
18 changed files with 74 additions and 33 deletions

View File

@@ -58,7 +58,7 @@ func main() {
var item Item var item Item
x(xml.Unmarshal(b, &item), filename) x(xml.Unmarshal(b, &item), filename)
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err) x(err)
root := doc.Root() root := doc.Root()
pp, err := root.Search(`//body/p | //body/h2`) pp, err := root.Search(`//body/p | //body/h2`)

View File

@@ -159,6 +159,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body) doc, err := gokogiri.ParseHtml(body)
p(err) p(err)

View File

@@ -158,6 +158,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body) doc, err := gokogiri.ParseHtml(body)
p(err) p(err)

View File

@@ -155,6 +155,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body) doc, err := gokogiri.ParseHtml(body)
p(err) p(err)

View File

@@ -168,6 +168,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
/* /*
s := string(body) s := string(body)
ok = true ok = true

View File

@@ -63,7 +63,7 @@ func main() {
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat))) x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
} }
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err) x(err)
root := doc.Root() root := doc.Root()
pp, err := root.Search(`//body//p`) pp, err := root.Search(`//body//p`)

View File

@@ -69,7 +69,9 @@ func main() {
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag))) x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
} }
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
x(fp.WriteString(u.FixSpace(item.Text))) for _, line := range strings.SplitAfter(item.Text, "\n") {
x(fp.WriteString(u.AddEnd(u.FixSpace(line, true))))
}
x(fp.Close()) x(fp.Close())
} }
} }

View File

@@ -161,6 +161,8 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
s := string(body) s := string(body)
ok := true ok := true
i1 := strings.Index(s, `<script type="application/ld+json"`) i1 := strings.Index(s, `<script type="application/ld+json"`)
@@ -226,22 +228,9 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
// text bevat kopjes zonder punt aan het eind // text bevat kopjes zonder punt aan het eind
lines := strings.Split(text, "\n") lines := strings.Split(text, "\n")
for i, line := range lines { for i, line := range lines {
line = u.FixSpace(line) lines[i] = u.AddEnd(u.FixSpace(line, true))
n := len(line)
if n > 0 {
if strings.ContainsAny(line[n-1:], ".!?") {
continue
}
}
if n > 1 {
s := line[n-2:]
if s == `."` || s == `!"` || s == `?"` {
continue
}
}
lines[i] = line + "."
} }
text = strings.Join(lines, "\n") + "\n" text = strings.Join(lines, "") + "\n"
fp, err := os.Create(filename + ".txt") fp, err := os.Create(filename + ".txt")
p(err) p(err)

View File

@@ -154,6 +154,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body) doc, err := gokogiri.ParseHtml(body)
p(err) p(err)

View File

@@ -65,7 +65,7 @@ func main() {
} }
} }
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err) x(err)
root := doc.Root() root := doc.Root()
pp, err := root.Search(`//body/p`) pp, err := root.Search(`//body/p`)

View File

@@ -164,6 +164,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body) doc, err := gokogiri.ParseHtml(body)
p(err) p(err)

View File

@@ -64,7 +64,7 @@ func main() {
var item Item var item Item
x(xml.Unmarshal(b, &item)) x(xml.Unmarshal(b, &item))
x(buf.WriteString(u.AddEnd(u.FixSpace(item.Title)))) x(buf.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err) x(err)
root := doc.Root() root := doc.Root()
divs, err := root.Search(`//div[@class="donatieformlinks"]`) divs, err := root.Search(`//div[@class="donatieformlinks"]`)

View File

@@ -226,7 +226,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
// text bevat kopjes zonder punt aan het eind // text bevat kopjes zonder punt aan het eind
lines := strings.Split(doc.Text, "\n") lines := strings.Split(doc.Text, "\n")
for i, line := range lines { for i, line := range lines {
lines[i] = u.AddEnd(u.FixSpace(line)) lines[i] = u.AddEnd(u.FixSpace(line, true))
} }
text := strings.Join(lines, "") + "\n" text := strings.Join(lines, "") + "\n"
@@ -240,9 +240,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
if strings.HasPrefix(t, "br_") { if strings.HasPrefix(t, "br_") {
continue continue
} }
if strings.HasPrefix(t, "tr_") { t = strings.TrimPrefix(t, "tr_")
t = t[3:]
}
p(fmt.Fprintf(fp, "##META text tag = %s\n", t)) p(fmt.Fprintf(fp, "##META text tag = %s\n", t))
} }
} }

View File

@@ -62,7 +62,7 @@ func main() {
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat))) x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
} }
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err) x(err)
root := doc.Root() root := doc.Root()
pp, err := root.Search(`//body//p`) pp, err := root.Search(`//body//p`)

View File

@@ -152,6 +152,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
s := string(body) s := string(body)
ok := true ok := true

View File

@@ -24,6 +24,8 @@ var (
x = e.ExitErr x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`) reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
reEM = regexp.MustCompile(`::EM::.*?::/EM::`)
reTitle = regexp.MustCompile(`^\p{Lu}`)
) )
func main() { func main() {
@@ -66,7 +68,7 @@ func main() {
x(fmt.Fprintf(fp, "##META text tag = %s\n", t)) x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
} }
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + em1(u.HtmlFixString(item.Text)) + `</body></html>`))
x(err) x(err)
root := doc.Root() root := doc.Root()
pp, err := root.Search(`//body/p`) pp, err := root.Search(`//body/p`)
@@ -74,9 +76,26 @@ func main() {
for _, p := range pp { for _, p := range pp {
s := p.Content() s := p.Content()
if !strings.Contains(s, "verscheen eerst op Tzum.") { if !strings.Contains(s, "verscheen eerst op Tzum.") {
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content())))) x(fp.WriteString(em2(u.AddEnd(u.FixSpace(p.Content())))))
} }
} }
x(fp.Close()) x(fp.Close())
} }
} }
// em1 swaps literal HTML emphasis tags for plain-text markers: every "<em>"
// becomes " ::EM::" and every "</em>" becomes "::/EM:: " (note the space
// padding on the outer side of each marker). Only the exact lowercase,
// attribute-free tag spellings are recognised; all other text is returned
// unchanged.
func em1(s string) string {
	markers := strings.NewReplacer(
		"<em>", " ::EM::",
		"</em>", "::/EM:: ",
	)
	return markers.Replace(s)
}
func em2(s string) string {
return reEM.ReplaceAllStringFunc(s, func(s1 string) string {
s1 = s1[6 : len(s1)-7]
if reTitle.MatchString(s1) {
return `"` + s1 + `"`
}
return s1
})
}

View File

@@ -179,6 +179,8 @@ func doArticle(filename string, url string, title string, tags []string, cats []
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
/* /*
s := string(body) s := string(body)
ok := true ok := true

View File

@@ -13,10 +13,19 @@ import (
var ( var (
p = e.PanicErr p = e.PanicErr
reEOL = regexp.MustCompile(`[.!?]['"”’]?$`) reEOL = regexp.MustCompile(`[.!?]['"”’]?$`)
reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}`) reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}+\.?`)
reLET = regexp.MustCompile(`\p{Lu}`) reLET = regexp.MustCompile(`\p{Lu}`)
reBody = regexp.MustCompile(`<[bB][rR][ /]*>`)
) )
// HtmlFix returns a copy of body in which every match of reBody is replaced
// by a single space character. The replacement is literal (no regexp
// expansion is applied to it).
func HtmlFix(body []byte) []byte {
	blank := []byte{' '}
	return reBody.ReplaceAllLiteral(body, blank)
}
// HtmlFixString is the string counterpart of HtmlFix: it returns body with
// every match of reBody replaced by a single literal space.
func HtmlFixString(body string) string {
	const blank = " "
	return reBody.ReplaceAllLiteralString(body, blank)
}
func AddEnd(s string) string { func AddEnd(s string) string {
s = strings.TrimSpace(s) s = strings.TrimSpace(s)
if s == "" { if s == "" {
@@ -28,12 +37,20 @@ func AddEnd(s string) string {
return s + ".\n" return s + ".\n"
} }
func FixSpace(s string) string { func FixSpace(s string, opt ...bool) string {
s = strings.Join(strings.Fields(s), " ") s = strings.Join(strings.Fields(s), " ")
s = reNEOL.ReplaceAllStringFunc(s, func(s1 string) string {
i := reLET.FindStringIndex(s1)[0] if len(opt) > 0 && opt[0] {
return s1[:i] + " " + s1[i:] s = reNEOL.ReplaceAllStringFunc(s, func(s1 string) string {
}) if strings.HasSuffix(s1, ".") {
// zoals: v.Chr.
return s1
}
i := reLET.FindStringIndex(s1)[0]
return s1[:i] + " " + s1[i:]
})
}
return s return s
} }