diff --git a/README.md b/README.admin.md similarity index 98% rename from README.md rename to README.admin.md index fa95f72..1506d2e 100644 --- a/README.md +++ b/README.admin.md @@ -12,7 +12,7 @@ query.sh ## 1. Verzamelen van berichten -Berichten van NieuwsNL in `NieuwsNL/yyyy/mm/dd/` +Berichten van NieuwsNL in `NieuwsNL/yyyy/mm/dd/` TODO Overigen in `[A-Z]*/yyyy/ww/` (weeknummer) diff --git a/README.user.md b/README.user.md new file mode 100644 index 0000000..1333ed7 --- /dev/null +++ b/README.user.md @@ -0,0 +1 @@ +TODO diff --git a/r/test-count.R b/r/test-count.R index ca35ab2..fb66ce4 100644 --- a/r/test-count.R +++ b/r/test-count.R @@ -1,5 +1,5 @@ -nw <- read.table('data/2026/algemeen-count-per-2026.23-1', sep="\t", quote="", encoding="utf-8", col.names=c("f", "word")) -od <- read.table('data/2026/algemeen-count-per-2026.22-4', sep="\t", quote="", encoding="utf-8", col.names=c("f", "word")) +nw <- read.table('data/2026/algemeen-allewoorden-2026.23-1', sep="\t", quote="", encoding="utf-8", col.names=c("f", "word", "tags")) +od <- read.table('data/2026/algemeen-allewoorden-2026.22-4', sep="\t", quote="", encoding="utf-8", col.names=c("f", "word", "tags")) words <- unique(c(od$word, nw$word)) o <- order(words) words <- words[o] @@ -26,3 +26,8 @@ nieuw[nieuw == 0] <- 0.5 plot(log(oud), log(nieuw)) lines(log(range(oud)), log(range(nieuw))) identify(log(oud), log(nieuw), labels=words) + + +#plot(oud, nieuw) +#lines(range(oud), range(nieuw)) +#identify(oud, nieuw, labels=words)