# brain-assistant / yet-another-applied-llm-benchmark
							from evaluator import *

# Human-readable one-line summary of what this test case checks.
DESCRIPTION = "Test if the model can guide a user in a conversation to setup a git repo."

# Category labels attached to this test case.
# NOTE(review): presumably used by the harness for filtering/reporting — confirm in evaluator.
TAGS = ['bash', "git", "agent"]

# Opening prompt for the conversation. It states the task (create a git repo
# and commit files `a` and `b`), fixes the turn protocol (one command per
# round, reply prefixed with "INPUT:"), and seeds two example INPUT/OUTPUT
# exchanges. The text is sent verbatim, so any edit changes the benchmark.
question = """
You are in a repository with two files `a` and `b`. Make a new git repo and then commit them both.

On each round you should just give me just one command to type, I'll run it, and give you the exact output. I'll get started:

INPUT: `pwd`

OUTPUT: `/usr/src/app`

INPUT: `ls`

OUTPUT: `main.py a b`

Now your turn. What do you want to run? Do not explain yourself, just give me exactly one command. Start your response with INPUT:"""


def test_ok():
    """Return True when git reports exactly one commit across all refs.

    Runs ``git rev-list --all --count`` through the shell in the current
    working directory and compares its standard output with ``"1"``. If the
    command prints nothing on stdout (for example because no repository
    exists yet), the comparison fails and False is returned.
    """
    import os

    # Use the pipe as a context manager so it is closed (and the child
    # process reaped) deterministically rather than whenever the garbage
    # collector gets to it.
    with os.popen("git rev-list --all --count") as pipe:
        commit_count = pipe.read().strip()
    return commit_count == "1"

def setup():
    """Create the fixture files and configure a global git identity.

    Writes two files, ``a`` and ``b``, into the current working directory
    (each containing its own name as text), then sets git's global
    ``user.email`` and ``user.name`` via the shell.
    NOTE(review): the identity is presumably needed so a later
    ``git commit`` does not fail for lack of an author — confirm.
    """
    import os

    # Context managers guarantee the content is flushed and the handles are
    # closed before any git command looks at the files.
    for name in ("a", "b"):
        with open(name, "w") as handle:
            handle.write(name)

    identity_commands = (
        'git config --global user.email "you@example.com"',
        'git config --global user.name "Your Name"',
    )
    for command in identity_commands:
        # Drain the pipe and close it so each command has finished before
        # the next step runs.
        with os.popen(command) as pipe:
            pipe.read()
def do_extract(x):
    """Pull the single shell command out of a model reply.

    Steps, in order of priority:

    1. If the reply contains ``INPUT:``, only the text between the first and
       second occurrence of that marker is considered.
    2. If that text has a non-blank backtick-quoted span, the content of the
       first such span is returned unchanged.
    3. Otherwise, if it has a triple-backtick fenced block, the first
       non-blank line inside the fence (after the optional language tag
       line) is returned.
    4. Otherwise the first non-blank line of the text is returned, with
       surrounding whitespace and stray backticks removed.

    Args:
        x: The raw text of the model's reply.

    Returns:
        The extracted command, or an empty string when nothing usable is
        found.
    """
    import re

    if "INPUT:" in x:
        x = x.split("INPUT:")[1]

    # Inline `command` form: the segment after the first backtick.
    pieces = x.split("`")
    if len(pieces) > 1 and pieces[1].strip():
        return pieces[1]

    # Fenced form: skip the opening fence line (which may carry a language
    # tag) and read up to the closing fence, or to the end if unterminated.
    fence = re.search(r"```[^\n`]*\n(.*?)(?:```|\Z)", x, re.DOTALL)
    candidate_text = fence.group(1) if fence else x

    # A reply may put the command on the line after "INPUT:", so skip blank
    # lines instead of blindly taking the first one.
    for line in candidate_text.splitlines():
        command = line.strip().strip("`").strip()
        if command:
            return command
    return ""

def do_prepare(x):
    """Format command output as the next message of the conversation.

    Logs the value being passed back, then returns it wrapped in an
    ``OUTPUT: `...`.`` line followed by a request for the next command.

    Args:
        x: The output to report back (interpolated into the message).

    Returns:
        The formatted follow-up prompt string.
    """
    print("Preparing to pass back", x)
    follow_up = f"OUTPUT: `{x}`.\n\nWhat command should I run next? Start your response with INPUT:"
    return follow_up
    
# Test pipeline: run `setup`, send the opening `question`, then hand UntilDone
# a `test_ok` evaluator plus an inner chain (model reply -> do_extract ->
# terminal run -> do_prepare) with max_iters=4, and finally evaluate `test_ok`.
# NOTE(review): the exact loop/stop semantics of UntilDone and the behaviour of
# the other stages are defined in the evaluator module — confirm there.
TestGitSimple = (
    Setup(setup)
    >> question
    >> UntilDone(
        PyEvaluator(test_ok),
        (
            LLMConversation()
            >> PyFunc(do_extract)
            >> TerminalRun()
            >> PyFunc(do_prepare)
        ),
        max_iters=4,
    )
    >> PyEvaluator(test_ok)
)
    

# When executed directly as a script, run this single test and print
# whatever run_test returns.
if __name__ == "__main__":
    outcome = run_test(TestGitSimple)
    print(outcome)