# Source: brain-assistant / yet-another-applied-llm-benchmark

							from evaluator import *

# Human-readable summary of what this test module checks.
DESCRIPTION = (
    "Test if the model can write a C program that draws an image. "
    "This test requires the ability to understand the .bmp specification, "
    "and draw a flag that can be correctly parsed and seen by the evaluator."
)

# Category labels attached to this test module.
TAGS = [
    "code",
    "c",
    "visual",
]

# Prompt for the American-flag variant. The leading and trailing newlines are
# part of the prompt text.
question = (
    "\n"
    "Write me a C program that emits a .bmp that's a depiction of an american flag to stdout.\n"
)

# Test pipeline: the prompt is chained through the stages below with `>>`.
# NOTE(review): the stages are defined in `evaluator`; judging by their names
# and arguments this presumably asks the model, extracts the C code, runs it
# capturing raw bytes, asks a vision model what flag the image shows, and
# passes if the answer contains any of the listed substrings — confirm
# against that module.
TestFlagDrawBMP = (
    question
    >> LLMRun()
    >> ExtractCode(keep_main=True)
    >> CRun(out_bytes=True)
    >> LLMVisionRun("What flag is shown in this image?")
    >> (
        SubstringEvaluator("United States")
        | SubstringEvaluator("USA")
        | SubstringEvaluator("America")
    )
)


# Prompt for the Italian-flag variant. The leading and trailing newlines are
# part of the prompt text.
question2 = (
    "\n"
    "Write me a C program that emits a .bmp that's a depiction of an italian flag to stdout.\n"
)

# Same stage chain as the American-flag test, with different accepted
# substrings at the end.
# NOTE(review): stage semantics come from `evaluator`; the final group
# presumably passes when the vision model's answer contains either
# substring — confirm against that module.
TestEasyFlagDrawBMP = (
    question2
    >> LLMRun()
    >> ExtractCode(keep_main=True)
    >> CRun(out_bytes=True)
    >> LLMVisionRun("What flag is shown in this image?")
    >> (
        SubstringEvaluator("Italy")
        | SubstringEvaluator("Italian")
    )
)


if __name__ == "__main__":
    # Running this file directly executes only the American-flag test and
    # prints whatever run_test returns; TestEasyFlagDrawBMP is not run here.
    outcome = run_test(TestFlagDrawBMP)
    print(outcome)