# brain-assistant / yet-another-applied-llm-benchmark

from evaluator import *

# One-line summary of what this benchmark case checks.
DESCRIPTION = "Test if a model can run an incremental update of a bash command without overwriting files that already exist"

# Category labels attached to this case.
TAGS = ['bash']

# Prompt placed into the test pipeline ahead of LLMRun().
# The backslash in front of ';' is spelled "\\;" so the literal holds a real
# backslash without relying on the invalid escape sequence "\;", which emits a
# SyntaxWarning on Python 3.12+. The resulting string value is unchanged.
question = 'Run find . -name "*.pdf" -exec python3 process.py {} txt/{}.txt \\;  but don\'t rewrite files that already exist. Instead of .pdf.txt make it just .txt'


def setup_fn() -> None:
    """Create the fixture files for the incremental-update test in the cwd.

    Writes ``process.py`` (a script that copies the file named by its first
    argument to the path given as its second argument), three inputs
    ``a.pdf``, ``b.pdf`` and ``c.pdf``, and a ``txt`` directory that already
    contains ``a.txt`` and ``c.txt`` with marker contents. Only ``b.pdf`` is
    left without a matching file in ``txt``.

    Raises:
        FileExistsError: if ``txt`` already exists (plain ``os.mkdir``).
    """
    # Import stays function-local, as in the original.
    # NOTE(review): presumably the harness runs this function on its own,
    # without the module's top-level imports -- confirm before hoisting.
    import os

    def write_text(path, content):
        # Close (and therefore flush) each file deterministically instead of
        # relying on garbage collection of an anonymous file object.
        with open(path, "w") as handle:
            handle.write(content)

    write_text("process.py", "import sys\nopen(sys.argv[2], 'w').write(open(sys.argv[1]).read())\n")
    write_text("a.pdf", "hello1")
    write_text("b.pdf", "hello2")
    write_text("c.pdf", "hello3")

    os.mkdir("txt")
    # Pre-existing outputs whose contents differ from what process.py would
    # produce, so an overwrite is detectable afterwards.
    write_text("txt/a.txt", "done1")
    write_text("txt/c.txt", "done3")


def check_ok() -> bool:
    """Report whether ``txt`` holds exactly the expected incremental result.

    The expected state is: ``txt/a.txt`` and ``txt/c.txt`` still contain their
    original marker text (``done1`` / ``done3``), ``txt/b.txt`` contains
    ``hello2``, and ``txt`` has exactly three entries.

    Returns:
        True when all of the above hold; False otherwise, including when
        ``txt`` or one of the expected files is missing or unreadable.
    """
    # Import stays function-local, as in the original.
    # NOTE(review): presumably the harness runs this function on its own,
    # without the module's top-level imports -- confirm before hoisting.
    import os

    expected = {"a.txt": "done1", "b.txt": "hello2", "c.txt": "done3"}
    try:
        for name, content in expected.items():
            with open(os.path.join("txt", name)) as handle:
                if handle.read() != content:
                    return False
        # Any extra entry (e.g. a leftover a.pdf.txt) means the naming
        # requirement was not met.
        return len(os.listdir("txt")) == len(expected)
    except OSError:
        # A missing directory/file is a failed check, not a crash.
        return False


# Test pipeline, one stage per line: fixture setup, the prompt, the model run,
# code extraction, bash execution, and finally scoring through check_ok.
# NOTE(review): the stage classes and the meaning of `>>` come from
# `evaluator`; the summary above is read off the stage names -- confirm there.
TestBashIncrementalUpdate = (
    Setup(setup_fn)
    >> question
    >> LLMRun()
    >> ExtractCode(keep_main=True)
    >> BashRun()
    >> PyEvaluator(check_ok)
)


if __name__ == "__main__":
    # Run this single case when the file is executed directly and show the result.
    outcome = run_test(TestBashIncrementalUpdate)
    print(outcome)