# brain-assistant / yet-another-applied-llm-benchmark

from evaluator import *

# One-sentence summary of what this benchmark case checks.
DESCRIPTION = "Test if the model can write a Python script that merges a list of file paths into 16 files of approximately equal size."

# Category labels for this test case (presumably read by the benchmark harness).
TAGS = ['code', 'python']

# Guided prompt: states the task and also spells out the balancing strategy
# (sort by size, largest first; always add to the currently smallest output).
question_easy = """
I have a list of several thousand file paths in /tmp/files. Write a python script that merges these all together and splits the content into 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. To do this, sort files by size (largest first) and add files in order always to the smallest one. "/tmp/files" is a list of file paths, one per line.
"""

# Unguided prompt: same task, but no balancing strategy is given.
# NOTE(review): "int 16 files" looks like a typo for "into 16 files". It is left
# untouched because this string is piped into the test pipeline as the prompt,
# so editing it changes the test input -- confirm before changing.
question = """
I have a list of several thousand file paths in /tmp/files. Write a python script that merges the content int 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. "/tmp/files" is a list of file paths, one per line.
"""


def setup_fn():
    """Create the input fixture for the merge task.

    Writes 1000 files named /tmp/a/file_0 .. /tmp/a/file_999, each filled
    with the character "x", and writes /tmp/files listing their paths one
    per line. Files 0-4 get a random length in [10000, 30000]; the rest get
    a random length in [1, 1000].

    The imports are kept inside the function so it stays self-contained.
    """
    import os
    import random

    # exist_ok=True lets the setup be re-run when /tmp/a is already there;
    # a plain os.mkdir would raise FileExistsError on the second run.
    os.makedirs("/tmp/a", exist_ok=True)

    with open("/tmp/files", "w") as index:
        for i in range(1000):
            path = "/tmp/a/file_%d" % i
            index.write(path + "\n")

            # A handful of much larger files among many small ones
            # (presumably so that a naive split ends up unbalanced).
            if i < 5:
                length = random.randint(10000, 30000)
            else:
                length = random.randint(1, 1000)

            with open(path, "w") as data_file:
                data_file.write("x" * length)
    
def test():
    """Check that the 16 merged outputs hold all the input data, evenly split.

    Reads /tmp/merged_0 .. /tmp/merged_15 and compares their lengths with the
    on-disk sizes of /tmp/a/file_0 .. /tmp/a/file_999.

    Returns:
        True when the outputs' combined length is within 2000 of the combined
        input size and the longest and shortest outputs differ by less than
        200; False otherwise.

    Raises:
        OSError: if any of the expected output or input files is missing
            (e.g. FileNotFoundError).
    """
    import os

    # Read each output inside a context manager so its handle is closed
    # right away instead of being left for the garbage collector.
    sizes = []
    for i in range(16):
        with open("/tmp/merged_%d" % i) as merged:
            sizes.append(len(merged.read()))

    size_before = sum(os.path.getsize("/tmp/a/file_%d" % i) for i in range(1000))
    size_after = sum(sizes)

    # The prompt doesn't say exactly how to merge, so added newlines are okay.
    if abs(size_before - size_after) > 2000:
        return False

    # Easy to hit with proper merging, hard otherwise.
    # 10 trials suggest the range is usually [10, 40].
    return max(sizes) - min(sizes) < 200
            
                    
# Guided variant: uses the prompt that spells out the balancing strategy.
# Stages are chained left to right with ">>"; their behaviour is defined in
# the evaluator module.
TestMake16FilesEasy = (
    Setup(setup_fn)
    >> question_easy
    >> LLMRun()
    >> ExtractCode(keep_main=True)
    >> Echo()
    >> PythonRun()
    >> PyEvaluator(test)
)

# Unguided variant: identical pipeline, but with the prompt that gives no
# balancing strategy.
TestMake16Files = (
    Setup(setup_fn)
    >> question
    >> LLMRun()
    >> ExtractCode(keep_main=True)
    >> Echo()
    >> PythonRun()
    >> PyEvaluator(test)
)


if __name__ == "__main__":
    # Running this module directly exercises only the guided variant.
    print(run_test(TestMake16FilesEasy))