1
0
mirror of https://github.com/msberends/AMR.git synced 2025-01-31 03:44:36 +01:00
AMR/data-raw/_generate_GPT_knowledge_input.sh

82 lines
3.4 KiB
Bash
Raw Normal View History

2024-10-04 15:28:44 +02:00
#!/bin/bash
# Safety guard: the paths used below (data-raw/, man/, vignettes/) are relative,
# so stop unless the last component of the working directory is exactly 'AMR'
[[ "$(basename "$PWD")" == "AMR" ]] || {
  echo "Error: The script must be run from the 'AMR' directory."
  exit 1
}

# Discard knowledge files left over from earlier runs, whatever their version
rm -rf data-raw/gpt_training_text_v*
2024-10-04 15:28:44 +02:00
# Define the output file, located in ./data-raw
2024-10-18 10:58:57 +02:00
# The package version is the first positional argument; it is embedded in the
# name of the generated knowledge file
version=$1
printf -v output_file 'data-raw/gpt_training_text_v%s.txt' "$version"
2024-10-04 15:28:44 +02:00
# Clear the output file if it exists
2025-01-27 21:43:10 +01:00
# Write the fixed preamble of the knowledge file: the assistant instructions,
# the package version, and a note describing how the rest of the file is split.
# Arguments: $1 - path of the file to create or overwrite
#            $2 - AMR package version mentioned in the text
write_preamble() {
  local target=$1
  local pkg_version=$2
  local rule
  # Build the 100-hyphen rule instead of typing it out, so its length is exact
  printf -v rule '%100s' ''
  rule=${rule// /-}
  # The single '>' on the group truncates the file once, clearing old content
  {
    printf '%s\n' 'This knowledge base contains all context you must know about the AMR package for R. You are a GPT trained to be an assistant for the AMR package in R. You are an incredible R specialist, especially trained in this package and in the tidyverse.'
    printf '\n'
    printf '%s\n' "First and foremost, you are trained on version ${pkg_version}. Remember this whenever someone asks which AMR package version youre at."
    printf '\n'
    # Single quotes keep the backticks literal; inside double quotes bash would
    # run NAMESPACE, index.md and man/*.Rd as commands and drop the names
    printf '%s\n' 'Below are the contents of the `NAMESPACE` file, the `index.md` file, and all the `man/*.Rd` files (documentation) in the package. Every file content is split using 100 hypens.'
    printf '%s\n' "$rule"
    printf '\n'
  } > "$target"
}

write_preamble "$output_file" "$version"
2024-10-04 15:28:44 +02:00
# Function to remove header block (delimited by # ======)
# Print a file to stdout with its header block(s) removed.
# A header block runs from a line containing '# ' followed by at least six '='
# up to and including the next such line; every block found is dropped.
# Arguments: $1 - path of the file to read
# Outputs:   the remaining lines on stdout; the file itself is not modified
remove_header() {
  local src_path=$1
  local drop_header_range='/# =\{6,\}/,/# =\{6,\}/d'
  sed "$drop_header_range" "$src_path"
}
2025-01-27 21:43:10 +01:00
# # Process all .R files in the 'R' folder
# for file in R/*.R; do
2025-01-27 21:43:10 +01:00
# echo "--------------------------------------------------" >> "$output_file"
# echo "THE PART HEREAFTER CONTAINS CONTENTS FROM FILE '$file':" >> "$output_file"
# echo -e "\n" >> "$output_file"
# remove_header "$file" >> "$output_file"
# echo -e "\n\n" >> "$output_file"
# done
2024-10-04 15:28:44 +02:00
2024-10-04 15:44:05 +02:00
# Process the package-level metadata files (NAMESPACE, index.md); a file that
# does not exist is skipped silently
printf -v section_rule '%100s' ''
section_rule=${section_rule// /-}
for file in NAMESPACE index.md; do
  [[ -f $file ]] || continue
  # One grouped redirect appends the rule, the banner and the raw file content
  {
    printf '%s\n' "$section_rule"
    printf '%s\n' "THE PART HEREAFTER CONTAINS CONTENTS FROM FILE '$file':"
    printf '\n\n'
    cat "$file"
    printf '\n\n\n'
  } >> "$output_file"
done
# Append one section per existing file to the output file. Each section is a
# 100-hyphen rule, a banner naming the file, and the file's content with its
# header block stripped by remove_header.
# Arguments: $1   - path of the output file to append to
#            $2.. - candidate input paths (typically an expanded glob)
append_sections() {
  local out_path=$1
  shift
  local rule src
  # Build the 100-hyphen rule instead of typing it out, so its length is exact
  printf -v rule '%100s' ''
  rule=${rule// /-}
  for src in "$@"; do
    # A glob that matches nothing is passed through as the literal pattern;
    # skip it rather than writing a banner for a file that does not exist
    [[ -f "$src" ]] || continue
    {
      printf '%s\n' "$rule"
      printf '%s\n' "THE PART HEREAFTER CONTAINS CONTENTS FROM FILE '$src':"
      printf '\n\n'
      remove_header "$src"
      printf '\n\n\n'
    } >> "$out_path"
  done
}

# Process all .Rd files from the 'man' folder
append_sections "$output_file" man/*.Rd
# Process all .Rmd files in the 'vignettes' folder
append_sections "$output_file" vignettes/*.Rmd
2025-01-27 21:43:10 +01:00
# Process README.md
# echo "THE PART HEREAFTER CONTAINS THE README OF OUR PYTHON PACKAGE" >> "$output_file"
# echo -e "\n" >> "$output_file"
# for file in PythonPackage/AMR/README.md; do
# remove_header "$file" >> "$output_file"
# echo -e "\n\n" >> "$output_file"
# done
# Process test files (if available) in the 'tests' folder
# for file in tests/*.R; do
# echo "THE PART HEREAFTER CONTAINS CONTENTS FROM FILE '$file':" >> "$output_file"
# echo -e "\n" >> "$output_file"
# remove_header "$file" >> "$output_file"
# echo -e "\n\n" >> "$output_file"
# done