## brain-assistant / yet-another-applied-llm-benchmark

## Copyright (C) 2024, Nicholas Carlini <nicholas@carlini.com>.
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.

import subprocess
import pickle
import random
import json
import os
import time
import io
import docker
import inspect
import re

import numpy as np

from PIL import Image

import docker_controller
from docker_controller import invoke_docker, DockerJob


## Constants that name which model a node should use. Nodes resolve these
## with getattr(self, <name>) (see LLMRun / LLMVisionRun), so each model value
## must match an attribute assigned in Node.setup().
LLM = "llm"                         # The LLM under evaluation
EVAL_LLM = "eval_llm"               # A good LLM that can act as a judge
VISION_EVAL_LLM = "vision_eval_llm" # And a good judge for vision tasks
PYTHON_ENV = "python3"              # Python executable used as the command for scripts run via invoke_docker

class Env:
    """
    Per-test-case state shared by all nodes of one computation graph.

    Every field defaults to None at class level; instances overwrite the
    fields they need while a test is running.
    """

    # Docker handle for the test; run_test() passes it, together with
    # `container`, to docker_controller.async_kill_container on cleanup.
    docker = None

    # Identifier of the fake docker object (only relevant in unsafe mode).
    fake_docker_id = None

    # Container in which the test's commands are executed.
    container = None

    # DockerJob for tests that talk to a process' stdin/stdout inside the
    # container. It has to persist across several nodes of the same test
    # case, which is why it lives on the environment (see StartDockerJob).
    docker_job = None
    
class Reason:
    """
    Records how a node arrived at its output, forming a trace of the
    solution path of a test.

    `node` identifies the node that produced the result (nodes in this file
    pass type(self)); `children` holds whatever details that node recorded.
    """
    def __init__(self, node, children):
        self.node, self.children = node, children

    def __repr__(self):
        # Rendered exactly like the plain (node, children) tuple.
        as_pair = (self.node, self.children)
        return repr(as_pair)
        
    
class Node:
    """
    Base class for the operations that make up a test case's computation
    graph; the most important object in this file. A test case might read

        Node1 >> Node2 >> (Node3 & Node4)

    Every operator that joins nodes returns a new node, so the line above
    is the same as writing

        ThenNode(ThenNode(Node1, Node2), AndNode(Node3, Node4))

    After the graph is built, setup() hands each node its context and the
    root node is called; each node type defines how evaluation proceeds
    from there.
    """

    def __init__(self, runner):
        """
        Store `runner`, the function that many sub-classes execute to carry
        out this node's computation.
        """
        self.runner = runner

    def setup(self, env, conv, llm, eval_llm, vision_eval_llm):
        """
        Attach the evaluation context to this node. Called on every node
        after the graph is constructed and before __call__ is used.
        """
        self.env, self.conv = env, conv
        self.llm, self.eval_llm, self.vision_eval_llm = llm, eval_llm, vision_eval_llm

    def __call__(self, orig_output=""):
        """
        Evaluate the test case starting at this node; the main entry point
        of the evaluation process. The base class does not implement it.

        Sub-classes in this file produce an iterable of pairs:
        1. The output of this node, to be passed to the next node.
        2. A Reason object describing how that output came about.
        """
        raise NotImplementedError()

    def __rshift__(self, other_node):
        """
        Implement `self >> other_node` by building a ThenNode.
        A plain string on the right is wrapped in a StringNode so that

            SetupNode >> "command to run" >> LLMRunNode

        works.
        """
        rhs = StringNode(other_node) if isinstance(other_node, str) else other_node
        return ThenNode(self, rhs)

    def __rrshift__(self, other_node):
        """
        Implement `other_node >> self` when the left operand is not a Node.
        The str class cannot be overridden, so this hook enables the (very
        common) pattern

            "command to run" >> LLMRunNode
        """
        lhs = StringNode(other_node) if isinstance(other_node, str) else other_node
        return ThenNode(lhs, self)

    def __and__(self, other_node):
        """`self & other_node` builds an AndNode."""
        return AndNode(self, other_node)

    def __or__(self, other_node):
        """`self | other_node` builds an OrNode."""
        return OrNode(self, other_node)

    def __invert__(self):
        """`~self` builds a NotNode."""
        return NotNode(self)

class StringNode(Node):
    """
    A constant node: ignores its input and emits a fixed string.
    """
    def __init__(self, string):
        """
        Remember the constant string this node emits.
        """
        self.string = string

    def __call__(self, orig_output=""):
        """
        Yield the stored string (the incoming value is not used), together
        with a Reason that records it.
        """
        constant = self.string
        yield constant, Reason(type(self), constant)
        

class ThenNode(Node):
    """
    Run two nodes one after the other, feeding each output of the first
    into the second.
    """
    def __init__(self, node1, node2):
        self.node1, self.node2 = node1, node2

    def setup(self, env, conv, llm, eval_llm, vision_eval_llm):
        """
        Store the context on this node and forward it to both children.
        """
        super().setup(env, conv, llm, eval_llm, vision_eval_llm)
        for child in (self.node1, self.node2):
            child.setup(env, conv, llm, eval_llm, vision_eval_llm)

    def __call__(self, orig_output=None):
        """
        For every result of node1, evaluate node2 on it and yield node2's
        output with a Reason that pairs both children's reasons.
        """
        for first_out, first_reason in self.node1(orig_output):
            for second_out, second_reason in self.node2(first_out):
                trace = Reason(type(self), (first_reason, second_reason))
                yield second_out, trace

class AndNode(ThenNode):
    """
    Evaluation node combining two children with `and`. Both children get
    the same input; setup() is inherited from ThenNode.
    """
    def __init__(self, node1, node2):
        self.node1, self.node2 = node1, node2

    def __call__(self, orig_output):
        """
        Yield `output1 and output2` for every combination of the children's
        results, with a Reason holding both reasons and the combined value.
        """
        for left, left_reason in self.node1(orig_output):
            for right, right_reason in self.node2(orig_output):
                both = left and right
                yield both, Reason(type(self), (left_reason, right_reason, both))

class OrNode(ThenNode):
    """
    Evaluation node combining two children with `or`. Both children get
    the same input; setup() is inherited from ThenNode.
    """
    def __init__(self, node1, node2):
        self.node1, self.node2 = node1, node2

    def __call__(self, orig_output):
        """
        Yield `output1 or output2` for every combination of the children's
        results, with a Reason holding both reasons and the combined value.
        """
        for left, left_reason in self.node1(orig_output):
            for right, right_reason in self.node2(orig_output):
                either = left or right
                yield either, Reason(type(self), (left_reason, right_reason, either))
                
class NotNode(Node):
    """
    Evaluation node that logically negates the result of its child.
    """
    def __init__(self, node1):
        self.node1 = node1

    def setup(self, env, conv, llm, eval_llm, vision_eval_llm):
        """
        Store the context on this node and forward it to the wrapped child.
        """
        super().setup(env, conv, llm, eval_llm, vision_eval_llm)
        self.node1.setup(env, conv, llm, eval_llm, vision_eval_llm)

    def __call__(self, orig_output):
        """
        Yield `not output` for each result of the child, with a Reason
        holding the child's reason and the negated value.
        """
        for inner_out, inner_reason in self.node1(orig_output):
            negated = not inner_out
            yield negated, Reason(type(self), [inner_reason, negated])

class PyFunc(Node):
    """
    A node that runs a python function (the `runner`) on the prior result.

    If the runner returns a tuple it is unpacked as `(ok, log)`; any other
    return value is passed on unchanged with an empty log. If the runner
    raises (or the tuple does not have exactly two items), the node emits
    an empty string with an "Error" reason instead of propagating.
    """
    def __call__(self, x):
        """
        Return a one-element list containing `(output, Reason)`.
        """
        try:
            out = self.runner(x)
            if isinstance(out, tuple):
                ok, log = out
                return [(ok, Reason(type(self), (log, ok)))]
            return [(out, Reason(type(self), ("", out)))]
        except Exception:
            # Deliberately best-effort: a crashing checker counts as a
            # failed step. Only Exception is caught so that Ctrl-C and
            # SystemExit still stop the run.
            return [("", Reason(type(self), ["Error", False]))]

class Echo(Node):
    """
    Debugging aid: prints the value flowing through the pipeline and passes
    it on unchanged, much like the Unix tee command.
    """
    def __init__(self):
        # No runner and no configuration.
        pass

    def __call__(self, x):
        """
        Print `x`, then yield it untouched with an empty Reason.
        """
        print('ECHOING:', x)
        passthrough = x
        yield passthrough, Reason(type(self), None)
    
class Setup(Node):
    """
    A node that prepares a fresh Docker environment by running a setup
    function inside it.

    The runner is given as a python function, but it is not called here:
    its source text is extracted and executed as a script through
    invoke_docker.
    """
    def __call__(self, x):
        """
        Set up docker for this environment, run the runner's source as
        setup.py, and return a one-element list with the command output.
        The incoming value `x` is not used.
        """
        docker_controller.setup_docker(self.env)

        func_name = self.runner.__name__
        # Script = the function definition followed by a call to it.
        script = inspect.getsource(self.runner) + f"\n\n{func_name}()"

        files = {"setup.py": script.encode()}
        out = invoke_docker(self.env, files, [PYTHON_ENV, "setup.py"])

        return [(out, Reason(type(self), None))]

class PyEvaluator(Node):
    """
    A node that judges whether the test case is solved by running a python
    checker through invoke_docker.

    The runner is given as a python function, but it is not called here:
    its source text is extracted and executed as a script.
    """
    def __call__(self, x):
        """
        Run the runner's source as check.py, printing its return value
        after a "final: " marker. The node passes when the captured output
        contains "final: True". The incoming value `x` is not used.
        """
        func_name = self.runner.__name__
        suffix = f"\n\nprint('final: ' + str({func_name}()))"
        script = inspect.getsource(self.runner) + suffix

        out = invoke_docker(self.env, {"check.py": script.encode()}, [PYTHON_ENV, "check.py"])

        passed = "final: True" in out
        return [(passed, Reason(type(self), [out, passed]))]
    

class SubstringEvaluator(Node):
    """
    Evaluation node that passes when a given substring occurs in the output.
    """
    def __init__(self, substr, lower=False):
        """
        `substr` is the text to look for; with `lower=True` both sides are
        lower-cased before comparing.
        """
        self.substr = substr
        self.lower = lower

    def __call__(self, output):
        """
        Yield True/False with a Reason holding the substring and the verdict.
        """
        needle, haystack = self.substr, output
        if self.lower:
            needle, haystack = needle.lower(), haystack.lower()

        found = needle in haystack
        yield found, Reason(type(self), [self.substr, found])

class RegexEvaluator(Node):
    """
    Evaluation node that passes when a regex pattern is found in the output.
    """
    def __init__(self, pattern, ignore_case=False):
        """
        `pattern` is handed to re.search; `ignore_case=True` adds
        re.IGNORECASE.
        """
        self.pattern = pattern
        self.ignore_case = ignore_case

    def __call__(self, output):
        """
        Yield True/False with a Reason holding the pattern and the verdict.
        """
        import re

        flags = 0
        if self.ignore_case:
            flags = re.IGNORECASE

        # re.search: the pattern may match anywhere in the output.
        matched = re.search(self.pattern, output, flags) is not None
        yield matched, Reason(type(self), [self.pattern, matched])
            
class ContainsIntEvaluator(Node):
    """
    Evaluation node that passes when a given integer appears in the output.
    """
    def __init__(self, num):
        self.num = num

    def __call__(self, output):
        """
        Collect every number-like token in the output, drop thousands
        separators, and check whether str(num) equals one of them exactly.
        Yields True/False with a Reason holding the number and the verdict.
        """
        # Tokens may carry a sign, commas and a decimal part, e.g. "-1,234.5".
        tokens = re.findall(r'-?[\d,]*\d+\.?\d*', output)
        normalized = [token.replace(",", "") for token in tokens]

        present = str(self.num) in normalized
        yield present, Reason(type(self), [self.num, present])
            
class EqualEvaluator(Node):
    """
    Evaluation node that passes when the output equals a given goal value.
    """
    def __init__(self, goal):
        self.goal = goal

    def __call__(self, output):
        """
        Yield True/False with a Reason holding the goal and the verdict.
        """
        # bool() mirrors the truthiness test of an `if` on the comparison.
        same = bool(self.goal == output)
        yield same, Reason(type(self), [self.goal, same])

class UntilDone(Node):
    """
    A node that repeats a body node until a condition node reports true,
    or until `max_iters` iterations have run.

    This is useful when a model should, e.g., iteratively interact with a
    sqlite database until it has completed some task.
    """
    def __init__(self, cond, body, max_iters=100):
        """
        `cond` is evaluated before every iteration; `body` is the node to
        repeat; `max_iters` bounds the number of body executions.
        """
        self.cond = cond
        self.body = body
        self.max_iters = max_iters

    def setup(self, env, conv, llm, eval_llm, vision_eval_llm):
        """
        Store the context on this node and forward it to cond and body.
        """
        super().setup(env, conv, llm, eval_llm, vision_eval_llm)
        self.cond.setup(env, conv, llm, eval_llm, vision_eval_llm)
        self.body.setup(env, conv, llm, eval_llm, vision_eval_llm)

    def __call__(self, orig_output=None):
        """
        Yield exactly one `(output, Reason)` pair: the value current when
        the condition first held (or after max_iters iterations), with a
        Reason that lists the body's reason from every iteration.
        """
        log = []
        for _ in range(self.max_iters):
            for output, _txt in self.cond(orig_output):
                if output:
                    yield orig_output, Reason(type(self), log)
                    return
            # Nodes may return either a generator or a plain list (PyFunc,
            # Setup, PyEvaluator); iter() makes next() work for both. Only
            # the body's first result is used.
            orig_output, partial = next(iter(self.body(orig_output)))
            log.append(partial)
        yield orig_output, Reason(type(self), log)
            
class ExtractJSON(Node):
    """
    A node that pulls a JSON object out of a model response.

    When the response holds exactly one fenced block, the JSON is taken
    from it directly. Otherwise the model is asked again to return just
    the JSON, and extraction runs on that second answer.
    """
    def __init__(self):
        # No runner and no configuration.
        pass

    def try_extract(self, output):
        """
        Yield candidate JSON strings from `output`: without any fence, the
        whole text; with fences, first the contents of the first fenced
        block and then all fenced blocks joined by newlines.
        """
        text = output.replace("```json", "```")
        if "```" not in text:
            yield text
            return

        pieces = text.split("```")
        # Odd-numbered pieces are the ones that sit inside a fence.
        yield pieces[1]
        yield "\n".join(pieces[1::2])

    def __call__(self, orig_output):
        """
        Yield `(candidate, Reason)` for every candidate from try_extract.
        """
        if orig_output.count("```") == 2:
            source = orig_output
        else:
            source = self.llm("Take the below answer to my question asking for a JSON output and just return the JSON object directly, with no other description, so I can copy it into an editor directly:\n" + orig_output)

        for candidate in self.try_extract(source):
            yield candidate, Reason(type(self), [candidate])

class ExtractCode(Node):
    """
    A node that pulls source code out of a model response.

    When the response holds exactly one fenced block, the code is taken
    from it directly. Otherwise the model is asked again to return just
    the code, and extraction runs on that second answer.
    """
    def __init__(self, keep_main=False, postfix="", manual=None, lang=None):
        """
        keep_main: ask the model to keep headers and main (needs an empty postfix).
        postfix:   text appended after the extracted code.
        manual:    custom re-query prompt; "<A>" is replaced by the response.
        lang:      optional language name mentioned in the re-query prompt.
        """
        self.keep_main = keep_main
        self.postfix = postfix
        self.manual = manual
        self.lang = lang

    def try_extract(self, output):
        """
        Yield one candidate: the contents of the first fenced block (or the
        whole text if there is no fence), followed by a newline and postfix.
        """
        # Strip language tags such as ```python so that every fence is bare.
        stripped = re.sub('```[a-z]*', '```', output)
        body = stripped.split("```")[1] if "```" in stripped else stripped
        yield body + "\n" + self.postfix

    def _emit(self, text):
        # Pair every extracted candidate with its Reason.
        for snippet in self.try_extract(text):
            yield snippet, Reason(type(self), snippet)

    def __call__(self, orig_output):
        """
        Yield `(code, Reason)` pairs, re-querying the model unless the
        response contains exactly one fenced block.
        """
        if orig_output.count("```") == 2:
            yield from self._emit(orig_output)
            return

        language = "" if self.lang is None else f"(in {self.lang})"

        if self.manual is not None:
            prompt = self.manual.replace("<A>", orig_output)
        elif self.keep_main:
            assert self.postfix == ""
            prompt = f"Take the below answer to my programming question {language} and return just the complete code in a single file so I can copy and paste it into an editor and directly run it. Include any header and main necessary so I can run it by copying this one file. DO NOT MODIFY THE CODE OR WRITE NEW CODE. Here is the code: \n" + orig_output
        else:
            helper_note = ("\nI will be running this code with the following helper functions:\n" + self.postfix) if self.postfix else ""
            prompt = f"Take the below answer to my programming question {language} and return just the complete code in a single file so I can copy and paste it into an editor and directly run it. Remove any test cases or example code after the function definition. Remove any main function. I will write those myself. Do include header imports. DO NOT MODIFY THE CODE OR WRITE NEW CODE. Here is the code: \n" + orig_output + helper_note

        output = self.llm(prompt)
        yield from self._emit(output)

class MakeFile(Node):
    """
    A node that creates a new file inside the docker environment, using the
    incoming text as the file's contents.
    """
    def __init__(self, name):
        self.name = name

    def __call__(self, code):
        """
        Hand `{name: encoded contents}` to invoke_docker with a no-op
        `echo` command, and yield its output with a Reason holding
        `(code, out)`.
        """
        files = {self.name: code.encode()}
        out = invoke_docker(self.env, files, ["echo"])
        yield out, Reason(type(self), (code, out))

class MakeFilesFromJSON(Node):
    """
    A node that creates several files within the docker environment from a
    JSON object mapping file names to file contents.
    """
    def __init__(self):
        # No runner and no configuration.
        pass

    def __call__(self, json_str):
        """
        Parse `json_str`, encode each value to bytes and hand the mapping to
        invoke_docker with a no-op `echo` command.

        Input that is not valid JSON, or whose top-level value is not an
        object, is treated as an empty set of files. Values are expected to
        be strings (or bytes).

        Yields the invoke_docker output with a Reason holding
        `(json_str, out)`.
        """
        try:
            parsed = json.loads(json_str)
        except Exception:
            # Best-effort: a malformed model answer simply creates no files.
            parsed = {}

        # A JSON list/number/string has no file names to iterate over.
        if not isinstance(parsed, dict):
            parsed = {}

        files = {
            name: content if isinstance(content, bytes) else content.encode()
            for name, content in parsed.items()
        }

        out = invoke_docker(self.env, files, ["echo"])
        yield out, Reason(type(self), (json_str, out))
        

class PythonRun(Node):
    """
    A node that executes the prior node's output as a python program.

    A block of test-case code can optionally be appended to the program.
    """
    def __init__(self, test_case="", out_bytes=False):
        """
        `test_case` is appended to the code; `out_bytes` is forwarded to
        invoke_docker.
        """
        self.test_case = test_case
        self.out_bytes = out_bytes

    def __call__(self, code):
        """
        Run `code` plus the test case as main.py and yield the output with
        a Reason holding `(full code, out)`.
        """
        program = "\n\n".join((code, self.test_case))

        out = invoke_docker(
            self.env,
            {"main.py": program.encode()},
            [PYTHON_ENV, "main.py"],
            out_bytes=self.out_bytes,
        )
        yield out, Reason(type(self), (program, out))

class SQLRun(Node):
    """
    A node that executes the prior node's output as a sqlite script.
    """
    def __init__(self):
        # No runner and no configuration.
        pass

    def __call__(self, code):
        """
        Write `code` to run.sql and run sqlite3 on database.db with that
        file as its -init script. Yields the output with a Reason holding
        `(code, out)`.
        """
        command = ["sqlite3", "-init", "run.sql", "database.db", ".exit"]
        out = invoke_docker(self.env, {"run.sql": code.encode()}, command)
        yield out, Reason(type(self), (code, out))
        
class BashRun(Node):
    """
    A node that runs the output from the prior command as a bash script.
    """
    def __init__(self, test_case="", args=None):
        """
        `test_case` is appended to the script; `args` are extra
        command-line arguments passed to the script (default: none).
        """
        self.test_case = test_case
        # Copy into a fresh list so instances never share one mutable
        # default, and later changes to the caller's list have no effect.
        self.args = [] if args is None else list(args)

    def __call__(self, code):
        """
        Run `code` plus the test case as main.sh and yield the output with
        a Reason holding `(full script, out)`.
        """
        code = code + "\n\n" + self.test_case

        out = invoke_docker(self.env, {"main.sh": code.encode()}, ["bash", "main.sh", *self.args])
        yield out, Reason(type(self), (code, out))

class TerminalRun(Node):
    """
    A node that runs the incoming text directly as a shell command.
    """
    def __init__(self):
        # No runner and no configuration.
        return

    def __call__(self, code):
        """
        Run `code` through bash as main.sh; empty input runs nothing and
        gives an empty output. Yields the output with a Reason holding
        `(code, out)`.
        """
        out = (
            invoke_docker(self.env, {"main.sh": code.encode()}, ["bash", "main.sh"])
            if code
            else ""
        )
        yield out, Reason(type(self), (code, out))

class RustRun(Node):
    """
    A node that compiles and runs the Rust code produced by the prior node.

    A block of test-case code can optionally be appended to the program.
    """
    def __init__(self, test_case=""):
        self.test_case = test_case

    def __call__(self, code):
        """
        Build main.rs from `code` plus the test case, compile it with rustc
        and run the binary. Yields the output with a Reason holding
        `(full code, out)`.
        """
        # If both the answer and the test case define main, rename the
        # answer's one so that the test case's main is the entry point.
        both_have_main = 'fn main' in code and 'fn main' in self.test_case
        if both_have_main:
            code = code.replace('fn main', 'fn __delete_this__main')

        program = code + "\n\n" + self.test_case

        files = {
            "main.rs": program.encode(),
            "main.sh": "rustc -o a.out main.rs\n./a.out".encode(),
        }
        out = invoke_docker(self.env, files, ["bash", "main.sh"])
        yield out, Reason(type(self), (program, out))

class CRun(Node):
    """
    A node that compiles and runs the C code produced by the prior node.

    A block of test-case code can optionally be appended to the program.
    """
    def __init__(self, test_case="", out_bytes=False, gccflags="", argv=""):
        """
        test_case: code appended to the program.
        out_bytes: forwarded to invoke_docker.
        gccflags:  extra text added to the gcc command line.
        argv:      text added after ./a.out when running the binary.
        """
        self.test_case = test_case
        self.out_bytes = out_bytes
        self.gccflags = gccflags
        self.argv = argv

    def __call__(self, code):
        """
        Build main.c from `code` plus the test case, compile with gcc and
        run it. Yields the output with a Reason holding `(full code, out)`.
        """
        # If both the answer and the test case define main, rename the
        # answer's one so that the test case's main is the entry point.
        both_have_main = 'int main' in code and 'int main' in self.test_case
        if both_have_main:
            code = code.replace('int main', 'int __delete_this__main')

        program = code + "\n\n" + self.test_case

        build_and_run = f"gcc -o a.out main.c -lm {self.gccflags}\n./a.out {self.argv}"
        files = {
            "main.c": program.encode(),
            "main.sh": build_and_run.encode(),
        }
        out = invoke_docker(self.env, files, ["bash", "main.sh"], out_bytes=self.out_bytes)
        yield out, Reason(type(self), (program, out))


class CppRun(Node):
    """
    A node that compiles and runs the C++ code produced by the prior node.

    A block of test-case code can optionally be appended to the program.
    """
    def __init__(self, test_case="", out_bytes=False):
        """
        `test_case` is appended to the program; `out_bytes` is forwarded to
        invoke_docker.
        """
        self.test_case = test_case
        self.out_bytes = out_bytes

    def __call__(self, code):
        """
        Build main.cpp from `code` plus the test case, compile with g++ and
        run it. Yields the output with a Reason holding `(full code, out)`.
        """
        # If both the answer and the test case define main, rename the
        # answer's one so that the test case's main is the entry point.
        both_have_main = 'int main' in code and 'int main' in self.test_case
        if both_have_main:
            code = code.replace('int main', 'int __delete_this__main')

        program = code + "\n\n" + self.test_case

        files = {
            "main.cpp": program.encode(),
            "main.sh": "g++ -o a.out main.cpp -lm\n./a.out".encode(),
        }
        out = invoke_docker(self.env, files, ["bash", "main.sh"], out_bytes=self.out_bytes)
        yield out, Reason(type(self), (program, out))
        

class StartDockerJob(Node):
    """
    Start a new, terminal-interactive process within the docker container.

    This lets us test models that are expected to interface with other
    software: the llm is connected to the process' stdin and stdout, data
    is sent to the program and its output is read back.
    """
    def __init__(self, command, eos_string):
        """
        `command` is the first thing sent to the job; `eos_string` is
        handed to DockerJob (presumably the end-of-output marker -- confirm
        in docker_controller).
        """
        self.command = command
        self.eos_string = eos_string

    def __call__(self, text):
        """
        Create the DockerJob, store it on the environment for later nodes,
        send the start command and yield its output with a Reason holding
        `(text, out)`.
        """
        container = self.env.container
        # The container may be an object exposing `.id` or the id itself.
        target = container.id if 'id' in dir(container) else container
        self.env.docker_job = DockerJob(target, self.eos_string)

        out = self.env.docker_job(self.command)
        yield out, Reason(type(self), (text, out))

class SendStdoutReceiveStdin(Node):
    """
    Send the incoming text to the stdin of the DockerJob currently stored
    on the environment, and pass on what the job returns from its stdout
    once it has handled that input.
    """
    def __init__(self):
        # No runner and no configuration.
        pass

    def __call__(self, text):
        """
        Yield the job's reply with a Reason holding `(out,)`.
        """
        job = self.env.docker_job
        out = job(text)
        yield out, Reason(type(self), (out,))

            
class LLMRun(Node):
    """
    A node that invokes a language model on the incoming text.

    This is the core operation that lets us evaluate the capabilities of
    any model.
    """
    def __init__(self, check_prompt="<A>", llm=LLM, json=False):
        """
        check_prompt: prompt template; "<A>" is replaced by the incoming text.
        llm:          name of the attribute holding the model to call.
        json:         forwarded to the model call as `json=`.
        """
        self.check_prompt = check_prompt
        self.which_llm = llm
        self.json = json

    def __call__(self, output):
        """
        Fill in the template, query the selected model and yield its answer
        with a Reason holding `(prompt sent, answer)`.
        """
        model = getattr(self, self.which_llm)
        prompt = self.check_prompt.replace("<A>", output)
        answer = model(prompt, json=self.json)

        yield answer, Reason(type(self), (prompt, answer))

class LLMConversation(Node):
    """
    A node that invokes a language model on the incoming text while keeping
    conversational state.

    Messages sent through this node can refer to earlier ones, unlike
    LLMRun, which is a stateless operation.
    """
    def __init__(self, check_prompt="<A>"):
        """
        `check_prompt` is the template; "<A>" is replaced by the incoming text.
        """
        self.check_prompt = check_prompt

    def __call__(self, output):
        """
        Fill in the template, send it through the shared conversation and
        yield the reply with a Reason holding `(message sent, reply)`.
        """
        message = self.check_prompt.replace("<A>", output)
        reply = self.conv(message)
        yield reply, Reason(type(self), (message, reply))

class SeleniumDraw(Node):
    """
    A node that creates a new HTML page, renders it in chrome, and then
    captures the output with Selenium.
    """
    def __init__(self):
        # No runner and no configuration.
        pass

    def _render(self, code):
        """
        Write `code` to a temporary HTML file, screenshot it in Chrome and
        return the screenshot as PNG bytes. Raises on any failure.
        """
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options

        chrome_options = Options()
        # NOTE: --headless is not enabled, so Chrome runs with a window.
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # A random suffix keeps separate runs from sharing file names.
        r = random.randint(0, 1000000)
        html_path = '/tmp/a%d.html' % r
        screenshot_path = '/tmp/a%d.png' % r

        with open(html_path, "w") as html_file:
            html_file.write(code)

        browser = webdriver.Chrome(options=chrome_options)
        try:
            browser.get('file://' + html_path)
            # Give the page time to finish rendering.
            time.sleep(2)
            browser.save_screenshot(screenshot_path)
        finally:
            # Always shut the browser down, even if rendering failed.
            browser.quit()

        time.sleep(1)

        with Image.open(screenshot_path) as raw:
            img = raw.convert('RGB')

        # Re-encode as PNG and hand back the raw bytes.
        buf = io.BytesIO()
        img.save(buf, format="PNG")
        return buf.getvalue()

    def __call__(self, code):
        """
        Yield the PNG bytes of the rendered page (also stored in the
        Reason), or empty bytes if rendering failed for any reason.
        """
        try:
            img_data = self._render(code)
        except Exception:
            # Best-effort: a page that cannot be rendered gives an empty image.
            img_data = b""

        # The yield sits outside the try block on purpose: closing this
        # generator raises GeneratorExit at the yield, and that must not be
        # caught and followed by a second yield.
        yield img_data, Reason(type(self), img_data)
        

class JSONSubsetEvaluator(Node):
    """
    An evaluation node that checks whether a goal structure is contained in
    the JSON output of the prior node.
    """
    def __init__(self, goal):
        self.goal = goal

    def check(self, goal, output):
        """
        Return True if `goal` is a subset of `output`, else False.

        - dict vs dict: every goal key must be present in output; dict and
          list values are checked recursively, other values must be equal.
        - list vs list: every goal item must be an element of output.
        - anything else: the two values must compare equal.
        """
        if isinstance(goal, dict) and isinstance(output, dict):
            for key, value in goal.items():
                if key not in output:
                    return False
                if isinstance(value, (dict, list)):
                    if not self.check(value, output[key]):
                        return False
                elif output[key] != value:
                    return False
            return True

        if isinstance(goal, list) and isinstance(output, list):
            # Always return a plain bool: a (False, Reason) tuple is truthy
            # and would be read as success by the callers.
            return all(item in output for item in goal)

        # Not a matching dict/list pair, so the values must simply be equal.
        return bool(goal == output)

    def __call__(self, output):
        """
        Parse `output` as JSON and yield True/False with a Reason holding
        the goal and the verdict. Unparseable output counts as a failure.
        """
        try:
            output = json.loads(output)
        except Exception:
            yield False, Reason(type(self), [self.goal, False])
            return

        ok = self.check(self.goal, output)
        yield ok, Reason(type(self), [self.goal, ok])

class LLMVisionRun(Node):
    """
    A node that evaluates an image produced by a prior operation by
    invoking the vision evaluation model.
    """
    def __init__(self, check_prompt="<A>", llm=VISION_EVAL_LLM):
        """
        `check_prompt` is the question sent along with the image; `llm` is
        the name of the attribute holding the model to call.
        """
        self.check_prompt = check_prompt
        self.which_llm = llm

    def __call__(self, output):
        """
        Query the model with the prompt and the image and yield its answer.
        If anything raises, the exception text is yielded as the answer.
        The Reason holds `(prompt, answer)`.
        """
        judge = getattr(self, self.which_llm)
        try:
            # Raw bytes are decoded into an image; anything else is passed as is.
            img = Image.open(io.BytesIO(output)) if isinstance(output, bytes) else output
            out = judge(self.check_prompt, add_image=img, max_tokens=512)
        except Exception as e:
            out = str(e)
        yield out, Reason(type(self), (self.check_prompt, out))

class Conversation:
    """
    Keeps the message history between the model and the earlier
    questions/steps of a test case.
    """
    def __init__(self, llm,preample = ''):
        """
        `llm` is called with the whole history on every turn; `preample` is
        prepended to the very first message only.
        """
        self.llm = llm
        self.history = []
        self.preample = preample

    def __call__(self, msg):
        """
        Append `msg` to the history, query the model with the full history,
        record the reply and return it.
        """
        if not self.history:
            # First turn: put the preamble in front of the message.
            msg = self.preample + msg
        self.history.append(msg)
        reply = self.llm(self.history)
        self.history.append(reply)
        return reply

    def __repr__(self):
        return f"Conversation({self.history!r})"

def run_test(test):
    """
    A helper function to run just one specific test case.
    Used to debug tests by running each file directly.

    Sets up the graph rooted at `test`, evaluates it until the first
    successful result, prints the formatted trace of the last result seen,
    and returns True if any result succeeded.

    The docker container, if one was started, is killed even when the
    evaluation or the formatting raises.
    """
    from llm import llm, eval_llm, vision_eval_llm
    env = Env()
    test.setup(env, Conversation(llm), llm, eval_llm, vision_eval_llm)

    ok = False
    output = None
    try:
        for success, output in test():
            if success:
                ok = True
                break

        # Nothing to print if the graph produced no result at all.
        if output is not None:
            import create_results_html
            fmt = create_results_html.format_markdown(output)
            # Collapse blank lines, then restore one before each heading.
            while '\n\n' in fmt:
                fmt = fmt.replace('\n\n', '\n')
            fmt = fmt.replace("\n#", "\n\n#")
            print(fmt)
    finally:
        if env.container:
            docker_controller.async_kill_container(env.docker, env.container)

    return ok
    

def make_python_test(q_and_a, header=""):
    """
    Build a python test script from `(expression, expected)` source pairs.

    The script starts with `header`, then asserts for each pair that the
    expression equals the expected value, and finally prints a success
    line. Returns `(script, success_line)`.
    """
    checks = [f"""
answer = {q}
expected = {a}
assert answer == expected, f'Wrong answer; got {{answer}} instead of {{expected}}'"""
              for q, a in q_and_a]

    script = "\n".join([header, *checks, "print('All tests passed')"])
    return script, "All tests passed"
    

def make_c_test(q_and_a, header="", extra_methods=""):
    """
    Build a C test harness from `(expression, expected)` source pairs.

    The harness includes stdio/stdlib, then `extra_methods`, then a main()
    that starts with `header` and checks each pair in turn: on a mismatch
    it prints a message and exits with status 1, otherwise it ends by
    printing a success line. Returns `(c_source, success_line)`.
    """
    qs = []

    qs.append("#include<stdio.h>\n#include<stdlib.h>\n" + extra_methods + "\nint main() {")
    qs.append(header)
    for q, a in q_and_a:
        # Each check gets its own { } scope; otherwise a second check would
        # redeclare `answer` / `expected` and the C code would not compile.
        qs.append(f"""
{{
int answer = {q};
int expected = {a};
if (answer != expected) {{
    printf("Wrong answer; got %d instead of %d.\\n", answer, expected);
    exit(1);
}}
}}""")
    qs.append('printf("All tests passed\\n");')

    qs.append("}")

    return "\n".join(qs), "All tests passed"