Set up the code environment¶
We need to set up a separate sandbox environment in which the generated code is executed. In the root directory of the repository, run:
python -m venv .ds1000env
source .ds1000env/bin/activate
pip install -U pip
pip install -e .[ds1000_code_env]
python -c "import torch, pandas as pd; print(torch.__version__, pd.__version__)"
Verify the environment exists and resolve its Python path¶
In [1]:
from pathlib import Path
import os
root = Path.cwd()
env_py = root / "../../../.ds1000env" / ("Scripts" if os.name == "nt" else "bin") / ("python.exe" if os.name == "nt" else "python")
print(env_py, env_py.exists())
/home/samuelkiegeland/genlm-eval/docs/cookbook/domains/../../../.ds1000env/bin/python True
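As an optional sanity check, you can probe the sandbox interpreter directly with a subprocess call and confirm that the core libraries import cleanly. This is a minimal sketch; the exact packages installed by the ds1000_code_env extra are an assumption here, so adjust the import list to match your setup.
import subprocess

# Hedged sanity check: assumes numpy and pandas are among the packages
# installed into the sandbox by the ds1000_code_env extra.
check = subprocess.run(
    [str(env_py), "-c", "import numpy, pandas; print('sandbox OK')"],
    capture_output=True,
    text=True,
    timeout=30,
)
print(check.stdout.strip() or check.stderr.strip())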
Usage¶
Initialize the dataset and evaluator¶
In [2]:
from genlm.eval.domains.ds1000 import (
    DS1000Dataset, DS1000Evaluator
)
/home/samuelkiegeland/micromamba/envs/py311/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
INFO 10-03 17:04:15 [__init__.py:235] Automatically detected platform cuda.
In [3]:
dataset = DS1000Dataset.from_hf(
    split="test",
    libraries=None,
    perturbation_types=None,
    max_instances=8,
    shuffle=False,
)
print("Instances loaded:", len(dataset))

evaluator = DS1000Evaluator(
    python_executable=str(env_py),
    timeout_seconds=15.0,
    extra_env={"PYTHONHASHSEED": "0"},
)
Instances loaded: 8
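If you only need a slice of the benchmark, from_hf also accepts filters through the libraries and perturbation_types arguments that were left as None above. The sketch below assumes these take lists of the same strings that appear in instance.metadata (e.g. "Pandas", "Origin"); consult the DS1000Dataset documentation if the call rejects them.
# Hedged sketch: load only the original (unperturbed) Pandas problems.
pandas_origin = DS1000Dataset.from_hf(
    split="test",
    libraries=["Pandas"],           # assumed to match metadata["library"]
    perturbation_types=["Origin"],  # assumed to match metadata["perturbation_type"]
    max_instances=4,
    shuffle=False,
)
print("Filtered instances:", len(pandas_origin))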
Inspect the dataset¶
In [4]:
first = next(iter(dataset))
print("Instance ID:", first.instance_id)
print("Library:", first.metadata.get("library"))
print("Perturbation:", first.metadata.get("perturbation_type"))
print("Prompt preview:\n", (first.prompt[:500] + "...") if len(first.prompt) > 500 else first.prompt)
Instance ID: 0
Library: Pandas
Perturbation: Origin
Prompt preview:
Problem:
I have the following DataFrame:
Col1 Col2 Col3 Type
0 1 2 3 1
1 4 5 6 1
2 7 8 9 2
3 10 11 12 2
4 13 14 15 3
5 16 17 18 3
The DataFrame is read from a CSV file. All rows which have Type 1 are on top, followed by the rows with Type 2, followed by the rows with Type 3, etc.
I would like to shuffle the order of the DataFrame's rows according to a list. \
For example, give a list [2, 4, 0, 3, 1...
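To get a quick overview of what was loaded, you can iterate over the dataset and tally the metadata fields shown above:
from collections import Counter

# Count instances per library and perturbation type, using only the
# metadata fields demonstrated above.
by_library = Counter()
by_perturbation = Counter()
for inst in dataset:
    by_library[inst.metadata.get("library")] += 1
    by_perturbation[inst.metadata.get("perturbation_type")] += 1
print("By library:", dict(by_library))
print("By perturbation:", dict(by_perturbation))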
Model Adaptor¶
In [ ]:
from genlm.control import PromptedLLM, AWRS
from genlm.eval import ModelOutput, ModelResponse
from genlm.eval.domains.ds1000 import (
    DS1000RuntimeNoErrorPotential, default_prompt_formatter
)

# Load an LLM
LLM = PromptedLLM.from_name("meta-llama/Meta-Llama-3-8B", temperature=0.5)


async def model(instance, output_dir, replicate):
    # Set the prompt for the LLM.
    LLM.prompt_ids = default_prompt_formatter(
        LLM.model.tokenizer, instance, use_chat_format=False
    )

    # Define a potential that ensures the code throws no error
    potential = DS1000RuntimeNoErrorPotential(
        code_context=instance.code_context,
        python_executable=str(env_py),
        extra_env={"PYTHONHASHSEED": "0"},
        timeout_seconds=15.0,
    ).coerce(LLM)

    # Define an adaptive weighted rejection sampler to sample tokens from the constrained model.
    sampler = AWRS(LLM, potential)

    # Run SMC to sample sequences from the constrained model.
    sequences = await sampler.smc(
        n_particles=5,
        ess_threshold=0.5,
        max_tokens=256,
    )

    # Return the sampled sequences and their probabilities as a ModelOutput.
    return ModelOutput(
        responses=[
            ModelResponse(response=sequence, weight=prob)
            for sequence, prob in sequences.decoded_posterior.items()
        ],
    )
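Before launching the full evaluation, it can be handy to smoke-test the adaptor on the single instance inspected earlier. This sketch assumes that ModelResponse exposes its response and weight fields as attributes and that output_dir and replicate are unused by the adaptor above, so placeholder values are fine here.
# Hypothetical smoke test; run_evaluation below performs the real loop.
output = await model(first, output_dir=None, replicate=0)
for r in output.responses:
    print(f"weight={r.weight:.3f}  response={r.response!r}")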
Run the evaluation¶
In [ ]:
from genlm.eval import run_evaluation

results = await run_evaluation(
    dataset=dataset,
    model=model,
    evaluator=evaluator,
    max_instances=2,
    n_replicates=1,
    verbosity=1,
    # output_dir="ds1000_results",  # optionally save the results to a directory
)
Instance instance_id=0 prompt="Problem:\nI have the following DataFrame:\n Col1 Col2 Col3 Type\n0 1 2 3 1\n1 4 5 6 1\n2 7 8 9 2\n3 10 11 12 2\n4 13 14 15 3\n5 16 17 18 3\n\n\nThe DataFrame is read from a CSV file. All rows which have Type 1 are on top, followed by the rows with Type 2, followed by the rows with Type 3, etc.\nI would like to shuffle the order of the DataFrame's rows according to a list. \\\nFor example, give a list [2, 4, 0, 3, 1, 5] and desired result should be:\n Col1 Col2 Col3 Type\n2 7 8 9 2\n4 13 14 15 3\n0 1 2 3 1\n3 10 11 12 2\n1 4 5 6 1\n5 16 17 18 3\n...\n\n\nHow can I achieve this?\n\n\nA:\n<code>\nimport pandas as pd\nimport numpy as np\n\n\ndf = pd.DataFrame({'Col1': [1, 4, 7, 10, 13, 16],\n 'Col2': [2, 5, 8, 11, 14, 17],\n 'Col3': [3, 6, 9, 12, 15, 18],\n 'Type': [1, 1, 2, 2, 3, 3]})\nList = np.random.permutation(len(df))\n</code>\nresult = ... # put solution in this variable\nBEGIN SOLUTION\n<code>" code_context='import pandas as pd\nimport numpy as np\nimport copy\n\n\ndef generate_test_case(test_case_id):\n def generate_ans(data):\n data = data\n df, List = data\n return df.iloc[List]\n\n def define_test_input(test_case_id):\n if test_case_id == 1:\n df = pd.DataFrame(\n {\n "Col1": [1, 4, 7, 10, 13, 16],\n "Col2": [2, 5, 8, 11, 14, 17],\n "Col3": [3, 6, 9, 12, 15, 18],\n "Type": [1, 1, 2, 2, 3, 3],\n }\n )\n List = np.random.permutation(len(df))\n return df, List\n\n test_input = define_test_input(test_case_id)\n expected_result = generate_ans(copy.deepcopy(test_input))\n return test_input, expected_result\n\n\ndef exec_test(result, ans):\n try:\n pd.testing.assert_frame_equal(result, ans, check_dtype=False)\n return 1\n except:\n return 0\n\n\nexec_context = r"""\nimport pandas as pd\nimport numpy as np\ndf, List = test_input\n[insert]\n"""\n\n\ndef test_execution(solution: str):\n code = exec_context.replace("[insert]", solution)\n for i in range(1):\n test_input, expected_result = generate_test_case(i + 1)\n test_env = {"test_input": test_input}\n exec(code, test_env)\n assert exec_test(test_env["result"], expected_result)' metadata={'problem_id': 0, 'library_problem_id': 0, 'library': 'Pandas', 'test_case_cnt': 1, 'perturbation_type': 'Origin', 'perturbation_origin_id': 0} reference_code='def g(df, List):\n return df.iloc[List]\n\nresult = g(df.copy(), List)\n'
Mean weighted accuracy (instance): 1.0
Mean weighted accuracy (total): 1.0
Instance instance_id=1 prompt="Problem:\nI have the following DataFrame:\n Col1 Col2 Col3 Type\n0 1 2 3 1\n1 4 5 6 1\n2 7 8 9 2\n3 10 11 12 2\n4 13 14 15 3\n5 16 17 18 3\n\n\nThe DataFrame is read from a CSV file. All rows which have Type 1 are on top, followed by the rows with Type 2, followed by the rows with Type 3, etc.\nI would like to shuffle the order of the DataFrame's rows according to a list. \nFor example, give a list [2, 4, 0, 3, 1, 5] and desired DataFrame should be:\n Col1 Col2 Col3 Type\n2 7 8 9 2\n4 13 14 15 3\n0 1 2 3 1\n3 10 11 12 2\n1 4 5 6 1\n5 16 17 18 3\n...\nI want to know how many rows have different Type than the original DataFrame. In this case, 4 rows (0,1,2,4) have different Type than origin.\nHow can I achieve this?\n\n\nA:\n<code>\nimport pandas as pd\nimport numpy as np\n\n\ndf = pd.DataFrame({'Col1': [1, 4, 7, 10, 13, 16],\n 'Col2': [2, 5, 8, 11, 14, 17],\n 'Col3': [3, 6, 9, 12, 15, 18],\n 'Type': [1, 1, 2, 2, 3, 3]})\nList = np.random.permutation(len(df))\n</code>\nresult = ... # put solution in this variable\nBEGIN SOLUTION\n<code>" code_context='import pandas as pd\nimport numpy as np\nimport copy\n\n\ndef generate_test_case(test_case_id):\n def generate_ans(data):\n data = data\n df, List = data\n df2 = df.iloc[List].reindex().reset_index(drop=True)\n return (df2.Type != df.Type).sum()\n\n def define_test_input(test_case_id):\n if test_case_id == 1:\n df = pd.DataFrame(\n {\n "Col1": [1, 4, 7, 10, 13, 16],\n "Col2": [2, 5, 8, 11, 14, 17],\n "Col3": [3, 6, 9, 12, 15, 18],\n "Type": [1, 1, 2, 2, 3, 3],\n }\n )\n List = np.random.permutation(len(df))\n return df, List\n\n test_input = define_test_input(test_case_id)\n expected_result = generate_ans(copy.deepcopy(test_input))\n return test_input, expected_result\n\n\ndef exec_test(result, ans):\n try:\n assert result == ans\n return 1\n except:\n return 0\n\n\nexec_context = r"""\nimport pandas as pd\nimport numpy as np\ndf, List = test_input\n[insert]\n"""\n\n\ndef test_execution(solution: str):\n code = exec_context.replace("[insert]", solution)\n for i in range(1):\n test_input, expected_result = generate_test_case(i + 1)\n test_env = {"test_input": test_input}\n exec(code, test_env)\n assert exec_test(test_env["result"], expected_result)' metadata={'problem_id': 1, 'library_problem_id': 1, 'library': 'Pandas', 'test_case_cnt': 1, 'perturbation_type': 'Difficult-Rewrite', 'perturbation_origin_id': 0} reference_code='def g(df, List):\n df2 = df.iloc[List].reindex().reset_index(drop=True)\n return (df2.Type != df.Type).sum()\n\nresult = g(df.copy(), List)\n'
Mean weighted accuracy (instance): 0.0
Mean weighted accuracy (total): 0.5
Yuhang Lai, Chengxi Li, Yiming Wang, Tianyi Zhang, Ruiqi Zhong, Luke Zettlemoyer, Wen-Tau Yih, Daniel Fried, Sida Wang, and Tao Yu. DS-1000: A natural and reliable benchmark for data science code generation. arXiv preprint arXiv:2211.11501, 2022. URL https://arxiv.org/abs/2211.11501