Skip to content

OpenAI

Generates texts using the OpenAI Batch API.

OpenAIClient

Bases: BatchClient

Source code in dactyl_generation/openai_generation.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
class OpenAIClient(BatchClient):
    """Batch client that submits chat-completion requests via the OpenAI Batch API."""

    def __init__(self, api_key: str) -> None:
        """
        Constructor for the OpenAI batch client.

        Args:
            api_key: OpenAI API key.
        """
        super().__init__()
        self.api_key = api_key
        self.client = OpenAI(api_key=api_key)

    @staticmethod
    def create_individual_request(custom_id: str, message_body: dict) -> dict:
        """
        Creates OpenAI REST API request for a single request.

        Args:
            custom_id: Custom ID of request. It is converted to a string.
            message_body: dictionary used as the request body. This includes the messages,
                max_completion_tokens and other parameters.

        Returns:
            request: individual request formatted for OpenAI REST API.
        """
        return {
            CUSTOM_ID: str(custom_id),
            "method": "POST",
            "url": "/v1/chat/completions",
            BODY: message_body,
        }

    def create_batch_job(self, prompts_df: pd.DataFrame) -> dict:
        """
        Creates a batch job with one request per row of `prompts_df`.

        Args:
            prompts_df: DataFrame where each row corresponds to an OpenAI API call. The
                CUSTOM_ID column supplies the request ID; all remaining columns form the
                request body.

        Returns:
            results: dictionary containing the ID of the created batch job (stored under
                RESULT_FILE_ID), the list of submitted requests and the API name.
        """
        records = prompts_df.drop(columns=[CUSTOM_ID]).to_dict("records")
        # Look the ID column up once instead of once per row.
        custom_ids = prompts_df[CUSTOM_ID].values
        requests = [
            OpenAIClient.create_individual_request(custom_id, record)
            for custom_id, record in zip(custom_ids, records)
        ]

        # The batch input is uploaded as JSON Lines: one request per line.
        payload = "\n".join(json.dumps(request) for request in requests)
        buffer = BytesIO(payload.encode("utf-8"))

        batch_file = self.client.files.create(
            file=buffer,
            purpose="batch"
        )

        batch_job = self.client.batches.create(
            input_file_id=batch_file.id,
            endpoint="/v1/chat/completions",
            completion_window="24h"
        )

        # Despite the key name, the stored value is the batch job ID; it is what
        # `get_batch_job_output` passes to `batches.retrieve`.
        return {
            RESULT_FILE_ID: batch_job.id,
            INPUT_FILE: requests,
            API_CALL: OPENAI
        }

    def get_batch_job_output(self, file_path: str) -> pd.DataFrame:
        """
        Gets batch job results using saved metadata from a local JSON file.

        Args:
            file_path: local JSON file containing output of the `create_batch_job` function

        Returns:
            df: pandas DataFrame of generations (text, custom ID, UTC timestamp) left-joined
                with the originally submitted requests on the custom ID.

        Raises:
            ValueError: if the batch job has no output file (for example because it has
                not finished yet).
        """
        with open(file_path, 'r', encoding="utf-8") as f:
            data = json.load(f)

        batch_id = data[RESULT_FILE_ID]
        batch_job = self.client.batches.retrieve(batch_id)
        output_file_id = batch_job.output_file_id
        if not output_file_id:
            status = getattr(batch_job, "status", "unknown")
            raise ValueError(
                f"Batch job {batch_id} has no output file (status: {status})."
            )
        result = self.client.files.content(output_file_id).content

        # Parse the JSON Lines output with `json` rather than `pd.read_json`: pandas'
        # dtype inference can turn numeric-looking custom IDs (e.g. "12") into integers,
        # which then no longer match the string IDs of the saved requests in the merge.
        generations = list()
        for line in BytesIO(result):
            if not line.strip():
                continue
            entry = json.loads(line)
            response = entry[RESPONSE]
            # NOTE(review): a failed request presumably has no `choices` in its response
            # body and would raise here - confirm how failures should be handled.
            generations.append({
                TEXT: response[BODY][CHOICES][0][MESSAGE][CONTENT],
                CUSTOM_ID: entry[CUSTOM_ID],
                TIMESTAMP: str(datetime.fromtimestamp(response[BODY][CREATED], tz=timezone.utc)),
            })

        # Explicit columns keep the merge key present even when the output is empty.
        generations = pd.DataFrame(generations, columns=[TEXT, CUSTOM_ID, TIMESTAMP])
        requests = pd.DataFrame(data[INPUT_FILE])

        return generations.merge(requests, on=CUSTOM_ID, how='left')

__init__(api_key)

Constructor for the OpenAI batch client.

Parameters:

Name Type Description Default
api_key str

OpenAI API key.

required
Source code in dactyl_generation/openai_generation.py
16
17
18
19
20
21
22
23
24
25
def __init__(self, api_key: str) -> None:
    """
    Set up the client: initialise the base class, keep the key, build the OpenAI handle.

    Args:
        api_key: OpenAI API key.
    """
    super().__init__()
    # Keep the key on the instance, then hand the same value to the SDK client.
    self.api_key = api_key
    self.client = OpenAI(api_key=self.api_key)

create_batch_job(prompts_df)

Creates batch job of prompts given messages and temperatures.

Parameters:

Name Type Description Default
prompts_df DataFrame

DataFrame where each row corresponds to an OpenAI API call.

required

Returns:

Name Type Description
results dict

dictionary containing request information

Source code in dactyl_generation/openai_generation.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def create_batch_job(self, prompts_df: pd.DataFrame) -> dict:
    """
    Build one request per row of `prompts_df`, upload them and start a batch job.

    Args:
        prompts_df: DataFrame where each row corresponds to an OpenAI API call. The
            CUSTOM_ID column supplies the request ID; the remaining columns form the
            request body.

    Returns:
        results: dictionary containing the created batch job's ID (under RESULT_FILE_ID),
            the list of submitted requests and the API name.
    """
    bodies = prompts_df.drop(columns=[CUSTOM_ID]).to_dict("records")
    ids = prompts_df[CUSTOM_ID].values
    requests = [
        OpenAIClient.create_individual_request(request_id, body)
        for request_id, body in zip(ids, bodies)
    ]

    # JSON Lines payload: one serialized request per line.
    jsonl = "\n".join(json.dumps(request) for request in requests)
    buffer = BytesIO(jsonl.encode("utf-8"))

    batch_file = self.client.files.create(file=buffer, purpose="batch")
    batch_job = self.client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    )

    # The value stored under RESULT_FILE_ID is the batch job's ID.
    return {
        RESULT_FILE_ID: batch_job.id,
        INPUT_FILE: requests,
        API_CALL: OPENAI,
    }

create_individual_request(custom_id, message_body) staticmethod

Creates OpenAI REST API request for a single request.

Parameters:

Name Type Description Default
custom_id str

Custom ID of request

required
message_body dict

Dictionary forming the body of a single request. This includes the messages, max_completion_tokens and other parameters.

required

Returns:

Name Type Description
request dict

individual request formatted for OpenAI REST API.

Source code in dactyl_generation/openai_generation.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
@staticmethod
def create_individual_request(custom_id: str, message_body: dict) -> dict:
    """
    Wrap a request body in the envelope used for one chat-completions batch request.

    Args:
        custom_id: Custom ID of request. It is converted to a string.
        message_body: dictionary used as the request body. This includes the messages,
            max_completion_tokens and other parameters.

    Returns:
        request: individual request formatted for OpenAI REST API.
    """
    return {
        CUSTOM_ID: str(custom_id),
        "method": "POST",
        "url": "/v1/chat/completions",
        BODY: message_body,
    }

get_batch_job_output(file_path)

Gets batch job results using saved metadata from a local JSON file.

Parameters:

Name Type Description Default
file_path str

local JSON file containing output of the create_batch_job function

required

Returns:

Name Type Description
df DataFrame

pandas DataFrame of generations.

Source code in dactyl_generation/openai_generation.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def get_batch_job_output(self, file_path: str) -> pd.DataFrame:
    """
    Gets batch job results using saved metadata from a local JSON file.

    Args:
        file_path: local JSON file containing output of the `create_batch_job` function

    Returns:
        df: pandas DataFrame of generations (text, custom ID, UTC timestamp) left-joined
            with the originally submitted requests on the custom ID.

    Raises:
        ValueError: if the batch job has no output file (for example because it has
            not finished yet).
    """
    with open(file_path, 'r', encoding="utf-8") as f:
        data = json.load(f)

    batch_id = data[RESULT_FILE_ID]
    batch_job = self.client.batches.retrieve(batch_id)
    output_file_id = batch_job.output_file_id
    if not output_file_id:
        status = getattr(batch_job, "status", "unknown")
        raise ValueError(
            f"Batch job {batch_id} has no output file (status: {status})."
        )
    result = self.client.files.content(output_file_id).content

    # Parse the JSON Lines output with `json` rather than `pd.read_json`: pandas'
    # dtype inference can turn numeric-looking custom IDs (e.g. "12") into integers,
    # which then no longer match the string IDs of the saved requests in the merge.
    generations = list()
    for line in BytesIO(result):
        if not line.strip():
            continue
        entry = json.loads(line)
        response = entry[RESPONSE]
        # NOTE(review): a failed request presumably has no `choices` in its response
        # body and would raise here - confirm how failures should be handled.
        generations.append({
            TEXT: response[BODY][CHOICES][0][MESSAGE][CONTENT],
            CUSTOM_ID: entry[CUSTOM_ID],
            TIMESTAMP: str(datetime.fromtimestamp(response[BODY][CREATED], tz=timezone.utc)),
        })

    # Explicit columns keep the merge key present even when the output is empty.
    generations = pd.DataFrame(generations, columns=[TEXT, CUSTOM_ID, TIMESTAMP])
    requests = pd.DataFrame(data[INPUT_FILE])

    return generations.merge(requests, on=CUSTOM_ID, how='left')