collect.sh: bugfix pubdate

This commit is contained in:
Peter Kleiweg
2026-04-10 18:36:13 +02:00
parent 929207d365
commit dc931deb91
3 changed files with 83 additions and 0 deletions

View File

@@ -48,9 +48,19 @@ do
# tellingen met tags # tellingen met tags
# pubdate kan soms verschillen voor hetzelfde bericht,
# bijvoorbeeld wanneer een oude versie op zaterdag is verschenen
# en een bijgewerkte versie op maandag. Dan valt de nieuwe versie in
# een volgende week (en dus directory) dan de oude versie, en wordt het
# bericht dus opnieuw opgeslagen.
# Daarom moet pubdate verwijderd worden uit de metadata, om dubbele
# tellingen te voorkomen.
# Dit speelt alleen(?) bij atom-feeds, zoals van de VRT.
alto \ alto \
'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \ 'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
'tt:%w\t%d\t%I' $files \ 'tt:%w\t%d\t%I' $files \
| sed -e 's/pubdate: "[-0-9]*"//' \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
| items2count > $part-nieuwe-namen-$ds-$i | items2count > $part-nieuwe-namen-$ds-$i
top20 $part-nieuwe-namen-$ds-$i top20 $part-nieuwe-namen-$ds-$i
@@ -58,6 +68,7 @@ do
alto \ alto \
'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \ 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
'tt:%w\t%d\t%I' $files \ 'tt:%w\t%d\t%I' $files \
| sed -e 's/pubdate: "[-0-9]*"//' \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
| items2count > $part-nieuwe-woorden-$ds-$i | items2count > $part-nieuwe-woorden-$ds-$i
top20 $part-nieuwe-woorden-$ds-$i top20 $part-nieuwe-woorden-$ds-$i
@@ -65,24 +76,28 @@ do
alto \ alto \
'fp://node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"])]' \ 'fp://node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"])]' \
'tt:%l\t%d\t%I' $files \ 'tt:%l\t%d\t%I' $files \
| sed -e 's/pubdate: "[-0-9]*"//' \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
| items2count > $part-locaties-$ds-$i | items2count > $part-locaties-$ds-$i
alto \ alto \
'fp://node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"])]' \ 'fp://node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"])]' \
'tt:%l\t%d\t%I' $files \ 'tt:%l\t%d\t%I' $files \
| sed -e 's/pubdate: "[-0-9]*"//' \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
| items2count > $part-personen-$ds-$i | items2count > $part-personen-$ds-$i
alto \ alto \
'fp://node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"])]' \ 'fp://node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"])]' \
'tt:%l\t%d\t%I' $files \ 'tt:%l\t%d\t%I' $files \
| sed -e 's/pubdate: "[-0-9]*"//' \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
| items2count > $part-organisaties-$ds-$i | items2count > $part-organisaties-$ds-$i
alto \ alto \
'fp://node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"])]' \ 'fp://node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"])]' \
'tt:%l\t%d\t%I' $files \ 'tt:%l\t%d\t%I' $files \
| sed -e 's/pubdate: "[-0-9]*"//' \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
| items2count > $part-overige-namen-$ds-$i | items2count > $part-overige-namen-$ds-$i

View File

@@ -1,2 +1,4 @@
dit wordt niet meer gebruikt dit wordt niet meer gebruikt
xquery is veel te traag xquery is veel te traag
met xquery doet het er 4.8 keer zo lang over als zonder

66
xquery/new2old.go Normal file
View File

@@ -0,0 +1,66 @@
package main
import (
e "codeberg.org/pebbe/errors"
"bufio"
"encoding/xml"
"fmt"
"os"
"regexp"
"strings"
)
// Item is one output record. It is marshalled with encoding/xml as an <i>
// element whose children appear in field order: <m>, zero or more <t>, <w>.
type Item struct {
// XMLName fixes the name of the enclosing element to "i".
XMLName xml.Name `xml:"i"`
// Msg is written as <m>; main fills it from the third tab-separated field.
Msg string `xml:"m"`
// Tags is written as one <t> element per entry; main fills it with the
// values parseTags extracts from the second tab-separated field.
Tags []string `xml:"t"`
// Word is written as <w>; main fills it from the first tab-separated field.
Word string `xml:"w"`
}
var (
// x is a short alias for e.ExitErr and is called with every error value in
// this file. NOTE(review): presumably it terminates the program when the
// error is non-nil; confirm against codeberg.org/pebbe/errors.
x = e.ExitErr
// reTag matches `tag: "..."` and captures the text between the quotes.
// Inside the quotes a backslash followed by any character is consumed as a
// pair, so an escaped quote does not end the value.
reTag = regexp.MustCompile(`tag: "((?:\\.|[^\\"])*)"`)
// reUnquote matches a backslash together with the single character after it
// (any character except a newline).
reUnquote = regexp.MustCompile(`\\.`)
)
func main() {
scanner := bufio.NewScanner(os.Stdin)
for scanner.Scan() {
line := scanner.Text()
aa := strings.Split(line, "\t")
item := Item{
// Msg: aa[2][:strings.LastIndex(aa[2], ".")],
Msg: aa[2],
Word: aa[0],
Tags: make([]string, 0),
}
for _, tag := range parseTags(aa[1]) {
item.Tags = append(item.Tags, tag)
}
b, err := xml.Marshal(item)
x(err)
fmt.Println(
strings.ReplaceAll(
strings.ReplaceAll(string(b), "'", "'"),
""", `"`))
}
x(scanner.Err())
}
// parseTags returns the unquoted value of every `tag: "..."` entry that reTag
// finds in s, in order of appearance. The result is an empty, non-nil slice
// when s contains no such entry.
func parseTags(s string) []string {
	matches := reTag.FindAllStringSubmatch(s, -1)
	values := make([]string, len(matches))
	for i, m := range matches {
		// m[1] is the capture group: the raw text between the quotes.
		values[i] = unquote(m[1])
	}
	return values
}
// unquote removes the escaping backslash from every backslash-plus-character
// pair that reUnquote matches in text, keeping the character that followed it.
func unquote(text string) string {
	// Each match starts with a one-byte backslash; drop it and keep the rest.
	dropBackslash := func(pair string) string {
		return pair[1:]
	}
	return reUnquote.ReplaceAllStringFunc(text, dropBackslash)
}