1
0
mirror of https://github.com/msberends/AMR.git synced 2025-01-14 00:11:50 +01:00
AMR/data-raw/_generate_GPT_knowledge_input.sh

75 lines
2.5 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Check if the current directory is named 'AMR'
if [ "$(basename "$PWD")" != "AMR" ]; then
echo "Error: The script must be run from the 'AMR' directory."
exit 1
fi
rm -rf data-raw/gpt_training_text_v*
# Define the output file, located in ./data-raw
version="$1"
output_file="data-raw/gpt_training_text_v${version}.txt"
# Clear the output file if it exists
echo "This files contains all context you must know about the AMR package for R." > "$output_file"
echo "First and foremost, you are trained on version ${version}. Remember this whenever someone asks which AMR package version youre at." >> "$output_file"
echo "--------------------------------" >> "$output_file"
echo "" >> "$output_file"
# Function to remove header block (delimited by # ======)
remove_header() {
sed '/# =\{6,\}/,/# =\{6,\}/d' "$1"
}
# Process all .R files in the 'R' folder
# for file in R/*.R; do
# echo "THE PART HEREAFTER CONTAINS CONTENTS FROM FILE '$file':" >> "$output_file"
# echo -e "\n" >> "$output_file"
# remove_header "$file" >> "$output_file"
# echo -e "\n\n" >> "$output_file"
# done
# Process important metadata files (DESCRIPTION, NAMESPACE, index.md)
for file in NAMESPACE index.md; do
if [[ -f $file ]]; then
echo "THE PART HEREAFTER CONTAINS CONTENTS FROM FILE '$file':" >> "$output_file"
echo -e "\n" >> "$output_file"
cat "$file" >> "$output_file"
echo -e "\n\n" >> "$output_file"
fi
done
# Process all .Rd files from the 'man' folder
for file in man/*.Rd; do
echo "THE PART HEREAFTER CONTAINS CONTENTS FROM FILE '$file':" >> "$output_file"
echo -e "\n" >> "$output_file"
remove_header "$file" >> "$output_file"
echo -e "\n\n" >> "$output_file"
done
# Process all .Rmd files in the 'vignettes' folder
for file in vignettes/*.Rmd; do
echo "THE PART HEREAFTER CONTAINS CONTENTS FROM FILE '$file':" >> "$output_file"
echo -e "\n" >> "$output_file"
remove_header "$file" >> "$output_file"
echo -e "\n\n" >> "$output_file"
done
# Process all .Rmd files in the 'vignettes' folder
echo "THE PART HEREAFTER CONTAINS THE README OF OUR PYTHON PACKAGE" >> "$output_file"
echo -e "\n" >> "$output_file"
for file in PythonPackage/AMR/README.md; do
remove_header "$file" >> "$output_file"
echo -e "\n\n" >> "$output_file"
done
# Process test files (if available) in the 'tests' folder
# for file in tests/*.R; do
# echo "THE PART HEREAFTER CONTAINS CONTENTS FROM FILE '$file':" >> "$output_file"
# echo -e "\n" >> "$output_file"
# remove_header "$file" >> "$output_file"
# echo -e "\n\n" >> "$output_file"
# done