1
0
mirror of https://github.com/msberends/AMR.git synced 2025-01-30 23:44:39 +01:00
AMR/data-raw/_generate_GPT_knowledge_input.sh

82 lines
3.4 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
#
# Builds a single plain-text knowledge file about the AMR package.
#
# Usage (from inside the 'AMR' directory):
#   <this script> <version>
#
# Effects:
#   - removes every existing data-raw/gpt_training_text_v* file
#   - creates data-raw/gpt_training_text_v<version>.txt and writes the
#     introductory text to it; later sections of this script append to it.

# All paths below are relative, so refuse to run anywhere but in 'AMR'.
if [[ "$(basename "$PWD")" != "AMR" ]]; then
  echo "Error: The script must be run from the 'AMR' directory." >&2
  exit 1
fi

# Drop knowledge files generated by earlier runs (any version).
rm -rf data-raw/gpt_training_text_v*

# Define the output file, located in ./data-raw
version="${1:-}"
if [[ -z "$version" ]]; then
  # Keep going for backward compatibility, but make the omission visible:
  # the file name and the version sentence below will lack a version number.
  echo "Warning: no version argument given; the knowledge base will not state a version." >&2
fi
output_file="data-raw/gpt_training_text_v${version}.txt"

# Create (or truncate) the output file and write the introduction.
{
  printf '%s\n' "This knowledge base contains all context you must know about the AMR package for R. You are a GPT trained to be an assistant for the AMR package in R. You are an incredible R specialist, especially trained in this package and in the tidyverse."
  printf '\n'
  printf '%s\n' "First and foremost, you are trained on version ${version}. Remember this whenever someone asks which AMR package version you're at."
  printf '\n'
  # Single quotes are required here: inside double quotes the Markdown
  # backticks would be executed as command substitutions.
  printf '%s\n' 'Below are the contents of the `NAMESPACE` file, the `index.md` file, and all the `man/*.Rd` files (documentation) in the package. Every file content is split using 100 hyphens.'
  printf '%s\n' "----------------------------------------------------------------------------------------------------"
  printf '\n'
} > "$output_file"
# remove_header FILE
#
# Prints FILE to stdout without its header block(s). A header block runs
# from a line containing "# " followed by six or more "=" up to and
# including the next such line. If no closing line follows, everything
# from the opening line to the end of the file is dropped.
remove_header() {
  local src_path="$1"
  local header_rule='# =\{6,\}'

  # sed range address: delete from one delimiter line through the next one.
  sed -e "/${header_rule}/,/${header_rule}/d" "$src_path"
}
# # Process all .R files in the 'R' folder
# for file in R/*.R; do
# echo "--------------------------------------------------" >> "$output_file"
# echo "THE PART HEREAFTER CONTAINS CONTENTS FROM FILE '$file':" >> "$output_file"
# echo -e "\n" >> "$output_file"
# remove_header "$file" >> "$output_file"
# echo -e "\n\n" >> "$output_file"
# done
# Append the package metadata files (NAMESPACE and index.md) verbatim.
# A file that does not exist as a regular file is skipped silently.
for file in NAMESPACE index.md; do
  if [[ ! -f "$file" ]]; then
    continue
  fi

  # One section per file: separator line, heading, blank lines, contents.
  {
    echo "----------------------------------------------------------------------------------------------------"
    echo "THE PART HEREAFTER CONTAINS CONTENTS FROM FILE '$file':"
    echo -e "\n"
    cat "$file"
    echo -e "\n\n"
  } >> "$output_file"
done
# Process all .Rd files from the 'man' folder: each file gets its own
# section (separator line, heading, contents with header block removed).
for file in man/*.Rd; do
  # When nothing matches, the shell leaves the literal pattern 'man/*.Rd'
  # in $file; skip it instead of writing a bogus section and failing in sed.
  [[ -f "$file" ]] || continue

  {
    echo "----------------------------------------------------------------------------------------------------"
    echo "THE PART HEREAFTER CONTAINS CONTENTS FROM FILE '$file':"
    echo -e "\n"
    remove_header "$file"
    echo -e "\n\n"
  } >> "$output_file"
done
# Process all .Rmd files in the 'vignettes' folder: each file gets its own
# section (separator line, heading, contents with header block removed).
for file in vignettes/*.Rmd; do
  # When nothing matches, the shell leaves the literal pattern
  # 'vignettes/*.Rmd' in $file; skip it instead of writing a bogus section
  # and failing in sed.
  [[ -f "$file" ]] || continue

  {
    echo "----------------------------------------------------------------------------------------------------"
    echo "THE PART HEREAFTER CONTAINS CONTENTS FROM FILE '$file':"
    echo -e "\n"
    remove_header "$file"
    echo -e "\n\n"
  } >> "$output_file"
done
# Process README.md
# echo "THE PART HEREAFTER CONTAINS THE README OF OUR PYTHON PACKAGE" >> "$output_file"
# echo -e "\n" >> "$output_file"
# for file in PythonPackage/AMR/README.md; do
# remove_header "$file" >> "$output_file"
# echo -e "\n\n" >> "$output_file"
# done
# Process test files (if available) in the 'tests' folder
# for file in tests/*.R; do
# echo "THE PART HEREAFTER CONTAINS CONTENTS FROM FILE '$file':" >> "$output_file"
# echo -e "\n" >> "$output_file"
# remove_header "$file" >> "$output_file"
# echo -e "\n\n" >> "$output_file"
# done