Skip to content

Autodrive

dadosfera.services.autodrive.create_vector_dataset

create_vector_dataset(base_url, filepaths, ocr_method='common')

Uploads multiple files to a dataset creation service and monitors until the dataset is ready.

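Args: - base_url (str): The base URL of the service. - filepaths (List[str]): A list of file paths to be uploaded. - ocr_method (Literal['premium','common']): The OCR method sent to the service with the upload; defaults to 'common'. The dataset status is polled every 60 seconds for 'premium' and every 5 seconds otherwise.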

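Returns: - tuple: A pair (details, dataset_id), where details is a dictionary with the dataset details returned by the service (including its final status, "success" or "failed") and dataset_id is the identifier of the created dataset.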

Raises: - HTTPError: If any of the HTTP requests return a non-200 status code.

Source code in dadosfera/services/autodrive.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def create_vector_dataset(base_url: str, filepaths: List[str],ocr_method: Literal['premium','common']='common'):
    """
    Uploads multiple files to a dataset creation service and monitors until the dataset is ready.

    Args:
    - base_url (str): The base URL of the service.
    - filepaths (List[str]): A list of file paths to be uploaded.
    - ocr_method (Literal['premium','common']): OCR method sent to the service as form data.
      It also selects the polling interval: 60 seconds for 'premium', 5 seconds otherwise.

    Returns:
    - tuple: ``(details, dataset_id)`` where ``details`` is the decoded JSON body of the last
      status response (its "status" is either "success" or "failed") and ``dataset_id`` is the
      identifier returned by the upload request.

    Raises:
    - HTTPError: Raised by ``response.raise_for_status()`` when a request returns an HTTP error status.
    """
    # Function-scope imports: the module's import block is not touched by this change.
    import os
    from contextlib import ExitStack

    # ExitStack guarantees every file opened for the upload is closed again,
    # whether the request succeeds or raises.
    with ExitStack() as stack:
        files = [
            ("files", (os.path.basename(file_path), stack.enter_context(open(file_path, "rb"))))
            for file_path in filepaths
        ]
        data = {'ocr_method': ocr_method}

        authorization = create_basic_auth(username='admin', password=extract_id(base_url))
        headers = {'Authorization': authorization}

        response = requests.post(
            url=f"{base_url}/upload",
            headers=headers,
            files=files,
            data=data,
        )

    if response.status_code != 200:
        logger.error(response.content)
        response.raise_for_status()

    dataset_id = response.json()["dataset_id"]

    terminal_states = ("success", "failed")
    sleep_interval = 60 if ocr_method == 'premium' else 5
    logger.info("Checking if dataset is ready")
    while True:
        # Send the same credentials as the upload request, matching how the
        # question endpoints are polled.
        response = requests.get(f"{base_url}/dataset/{dataset_id}", headers=headers)

        if response.status_code == 200:
            details = response.json()
            if details["status"] in terminal_states:
                logger.info(f"Response: {details}")
                return details, dataset_id
        else:
            # Raises for HTTP error statuses; any other non-200 status is
            # treated as "not ready" and polled again.
            response.raise_for_status()

        # Only report "not ready" once the status has actually been checked.
        logger.info(f"Dataset {dataset_id} not ready yet, waiting some seconds.")
        logger.info(f'Response not ready yet, awaiting {sleep_interval}')
        sleep(sleep_interval)

dadosfera.services.autodrive.ask_question_to_dataset

ask_question_to_dataset(base_url, dataset_id, question, metadata_filter=None, distance_metric='cos', maximize_marginal_relevance=True, fetch_k=10, k=3)

Asks a question to a previously created dataset and monitors until an answer is ready.

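Args: - base_url (str): The base URL of the service. - dataset_id (str): The unique identifier of the dataset. - question (str): The question string. - metadata_filter (Optional[Dict]): Optional metadata filter sent with the question; defaults to None. - distance_metric (Literal['cos','L2','L1','max','dot']): Distance metric sent with the question; defaults to 'cos'. - maximize_marginal_relevance (bool): Sent with the question; defaults to True. - fetch_k (int): Sent with the question; defaults to 10. - k (int): Sent with the question; defaults to 3.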

Returns: - dict: A dictionary containing the answer details. Typically this would include a question_id and the status of the question.

Raises: - HTTPError: If any of the HTTP requests return a non-200 status code.

Source code in dadosfera/services/autodrive.py
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def ask_question_to_dataset(
        base_url: str,
        dataset_id: str,
        question: str,
        metadata_filter: Optional[Dict] = None,
        distance_metric: Literal["cos","L2","L1","max","dot"] = "cos",
        maximize_marginal_relevance: bool = True,
        fetch_k: int = 10,
        k: int = 3
    ):
    """
    Asks a question to a previously created dataset and monitors until an answer is ready.

    Args:
    - base_url (str): The base URL of the service.
    - dataset_id (str): The unique identifier of the dataset.
    - question (str): The question string.
    - metadata_filter (Optional[Dict]): Forwarded unchanged in the request body.
    - distance_metric (Literal["cos","L2","L1","max","dot"]): Forwarded unchanged in the request body.
    - maximize_marginal_relevance (bool): Forwarded unchanged in the request body.
    - fetch_k (int): Forwarded unchanged in the request body.
    - k (int): Forwarded unchanged in the request body.
      NOTE(review): the meaning of the retrieval options is defined by the service — confirm against its docs.

    Returns:
    - dict: The decoded JSON body of the last status response, whose "status" is either "success" or "failed".

    Raises:
    - HTTPError: Raised by ``response.raise_for_status()`` when a request returns an HTTP error status.
    """
    authorization = create_basic_auth(username='admin', password=extract_id(base_url))

    # The same headers are used for the submission and for every poll.
    request_headers = {
        "Content-Type": "application/json",
        'Authorization': authorization,
    }

    payload = {
        "question": question,
        "metadata_filter": metadata_filter,
        "distance_metric": distance_metric,
        "maximize_marginal_relevance": maximize_marginal_relevance,
        "fetch_k": fetch_k,
        "k": k,
    }

    response = requests.post(
        url=f"{base_url}/dataset/{dataset_id}/question",
        headers=request_headers,
        data=json.dumps(payload),
    )

    if response.status_code != 200:
        logger.error(response.content)
        response.raise_for_status()

    question_id = response.json()["question_id"]

    terminal_states = ("success", "failed")
    status_url = f"{base_url}/dataset/{dataset_id}/question/{question_id}"
    logger.info("Checking if question is ready")
    while True:
        response = requests.get(status_url, headers=request_headers)
        if response.status_code != 200:
            response.raise_for_status()

        answer = response.json()
        if answer["status"] in terminal_states:
            logger.info(f"Response: {answer}")
            return answer

        # Only report "not ready" once the status has actually been checked.
        logger.info(f"Question {question_id} not ready yet, waiting some seconds.")
        sleep(5)

dadosfera.services.autodrive.ask_question_to_dataset_using_ai

ask_question_to_dataset_using_ai(base_url, dataset_id, question, metadata_filter=None, distance_metric='cos', maximize_marginal_relevance=True, fetch_k=10, k=3)

Asks Gemini a question about a previously created dataset and monitors until an answer is ready.

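Args: - base_url (str): The base URL of the service. - dataset_id (str): The unique identifier of the dataset. - question (str): The question string. - metadata_filter (Optional[Dict]): Optional metadata filter sent with the question; defaults to None. - distance_metric (Literal['cos','L2','L1','max','dot']): Distance metric sent with the question; defaults to 'cos'. - maximize_marginal_relevance (bool): Sent with the question; defaults to True. - fetch_k (int): Sent with the question; defaults to 10. - k (int): Sent with the question; defaults to 3.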

Returns: - dict: A dictionary containing the answer details. Typically this would include a question_id and the status of the question.

Raises: - HTTPError: If any of the HTTP requests return a non-200 status code.

Source code in dadosfera/services/autodrive.py
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
def ask_question_to_dataset_using_ai(
        base_url: str,
        dataset_id: str,
        question: str,
        metadata_filter: Optional[Dict] = None,
        distance_metric: Literal["cos","L2","L1","max","dot"] = "cos",
        maximize_marginal_relevance: bool = True,
        fetch_k: int = 10,
        k: int = 3
    ):
    """
    Asks Gemini a question about a previously created dataset and monitors until an answer is ready.

    The question is submitted to the service's ``ai_question`` endpoint.
    NOTE(review): that Gemini answers it comes from the original documentation, not from this code — confirm.

    Args:
    - base_url (str): The base URL of the service.
    - dataset_id (str): The unique identifier of the dataset.
    - question (str): The question string.
    - metadata_filter (Optional[Dict]): Forwarded unchanged in the request body.
    - distance_metric (Literal["cos","L2","L1","max","dot"]): Forwarded unchanged in the request body.
    - maximize_marginal_relevance (bool): Forwarded unchanged in the request body.
    - fetch_k (int): Forwarded unchanged in the request body.
    - k (int): Forwarded unchanged in the request body.
      NOTE(review): the meaning of the retrieval options is defined by the service — confirm against its docs.

    Returns:
    - dict: The decoded JSON body of the last status response, whose "status" is either "success" or "failed".

    Raises:
    - HTTPError: Raised by ``response.raise_for_status()`` when a request returns an HTTP error status.
    """
    authorization = create_basic_auth(username='admin', password=extract_id(base_url))

    # The same headers are used for the submission and for every poll.
    request_headers = {
        "Content-Type": "application/json",
        'Authorization': authorization,
    }

    payload = {
        "question": question,
        "metadata_filter": metadata_filter,
        "distance_metric": distance_metric,
        "maximize_marginal_relevance": maximize_marginal_relevance,
        "fetch_k": fetch_k,
        "k": k,
    }

    response = requests.post(
        url=f"{base_url}/dataset/{dataset_id}/ai_question",
        headers=request_headers,
        data=json.dumps(payload),
    )

    if response.status_code != 200:
        logger.error(response.content)
        response.raise_for_status()

    question_id = response.json()["question_id"]

    terminal_states = ("success", "failed")
    status_url = f"{base_url}/dataset/{dataset_id}/ai_question/{question_id}"
    logger.info("Checking if question is ready")
    while True:
        response = requests.get(status_url, headers=request_headers)
        if response.status_code != 200:
            response.raise_for_status()

        answer = response.json()
        if answer["status"] in terminal_states:
            logger.info(f"Response: {answer}")
            return answer

        # Only report "not ready" once the status has actually been checked.
        logger.info(f"Question {question_id} not ready yet, waiting some seconds.")
        sleep(5)