"""Module to handle dataset-related operations within a project."""
import os
import sys
import requests
from matrice.utils import handle_response
from datetime import datetime, timedelta
def get_dataset_size(session, url, project_id):
"""
Fetch the size of a dataset from the specified URL.
This function sends a request to retrieve the dataset size, measured in megabytes, for a given project.
Parameters
----------
session : Session
The active session used to communicate with the API.
url : str
The URL of the dataset to fetch the size for.
project_id : str
The ID of the project associated with the dataset.
Returns
-------
tuple
A tuple containing three elements:
- dict: API response with dataset size information (e.g., size in MB).
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Example
-------
>>> size_info, err, msg = get_dataset_size(session=session, url="https://example.com/dataset.zip", project_id="12345")
>>> if err:
>>> print(f"Error: {err}")
>>> else:
>>> print(f"Dataset size: {size_info.get('size', 'N/A')} MB")
"""
path = f"/v1/dataset/get_dataset_size_in_mb_from_url?projectId={project_id}"
requested_payload = {"datasetUrl": url}
headers = {"Content-Type": "application/json"}
resp = session.rpc.post(path=path, headers=headers, payload=requested_payload)
return handle_response(
resp, f"Dataset size fetched successfully", "Could not fetch dataset size"
)
def upload_file(session, file_path):
"""
Upload a file to the dataset. Only ZIP files are supported.
This function uploads a ZIP file to the dataset server for the specified session. It generates an upload URL,
then uses it to transfer the file.
Parameters
----------
session : Session
The active session used to communicate with the API.
file_path : str
The local path of the file to upload.
Returns
-------
dict
A dictionary containing:
- `success` (bool): Indicates if the upload was successful.
- `data` (str): URL of the uploaded file if successful, empty string otherwise.
- `message` (str): A status message indicating success or detailing any error.
Example
-------
>>> result = upload_file(session=session, file_path="path/to/data.zip")
>>> if result['success']:
>>> print(f"File uploaded successfully: {result['data']}")
>>> else:
>>> print(f"Error: {result['message']}")
"""
file_name = os.path.basename(file_path)
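    # _get_upload_path is expected to return a pre-signed URL that accepts an
    # HTTP PUT of the raw file bytes (an assumption based on the request below).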
upload_url, error, message = _get_upload_path(session, file_name)
if error is not None:
return {"success": False, "data": "", "message": message}
with open(file_path, "rb") as file:
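        # Passing the open file object lets requests stream the upload instead
        # of reading the whole archive into memory.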
response = requests.put(upload_url, data=file)
if response.status_code == 200:
return {
"success": True,
"data": upload_url.split("?")[0],
"message": "File uploaded successfully",
}
    else:
        # The error body may not be JSON (object stores often return XML),
        # so fall back to the raw response text when JSON parsing fails.
        try:
            error_message = response.json().get("message", "Network Error")
        except ValueError:
            error_message = response.text or "Network Error"
        return {
            "success": False,
            "data": "",
            "message": error_message,
        }
def _get_upload_path(session, file_name):
"""
Get the upload path for a specified file name.
This function generates an API request to retrieve the URL for uploading a specific file.
Parameters
----------
session : Session
The active session used to communicate with the API.
file_name : str
The name of the file for which the upload path is required.
Returns
-------
tuple
A tuple containing:
- dict: API response with the upload URL.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Example
-------
>>> resp, err, msg = _get_upload_path(session=session, file_name="data.zip")
>>> if err:
>>> print(f"Error: {err}")
>>> else:
>>> print(f"Upload Path: {resp.get('upload_url', 'N/A')}")
"""
path = f"/v1/dataset/upload-path?fileName={file_name}"
resp = session.rpc.get(path=path)
return handle_response(
resp, "Upload Path fetched successfully", "Could not fetch upload path"
)
class Dataset:
"""
Class to handle dataset-related operations within a project.
This class manages operations on a dataset within a specified project. During initialization,
either `dataset_name` or `dataset_id` must be provided to locate the dataset.
Parameters
----------
session : Session
The session object that manages the connection to the server.
dataset_id : str, optional
The ID of the dataset (default is None). Used to directly locate the dataset.
dataset_name : str, optional
The name of the dataset (default is None). If `dataset_id` is not provided, `dataset_name` will be used to find the dataset.
Attributes
----------
dataset_id : str
The unique identifier for the dataset.
dataset_name : str
The name of the dataset.
version_status : str
The processing status of the latest dataset version.
latest_version : str
The identifier of the latest version of the dataset.
no_of_samples : int
The total number of samples in the dataset.
no_of_classes : int
The total number of classes in the dataset.
no_of_versions : int
The total number of versions for this dataset.
last_updated_at : str
The timestamp of the dataset's most recent update.
summary : dict
Summary of the dataset's latest version, providing metrics like item count and class distribution.
Raises
------
ValueError
If neither `dataset_id` nor `dataset_name` is provided, or if there is a mismatch between `dataset_id` and `dataset_name`.
Example
-------
>>> session = Session(account_number=account_number, access_key=access_key, secret_key=secret_key)
>>> dataset = Dataset(session=session, dataset_id="12345",dataset_name="Sample")
>>> print(f"Dataset Name: {dataset.dataset_name}")
>>> print(f"Number of Samples: {dataset.no_of_samples}")
>>> print(f"Latest Version: {dataset.latest_version}")
"""
def __init__(self, session, dataset_id=None, dataset_name=None):
self.session = session
self.project_id = session.project_id
self.last_refresh_time = datetime.now()
self.dataset_id = dataset_id
self.dataset_name = dataset_name
self.rpc = session.rpc
        if not dataset_id and not dataset_name:
            raise ValueError("Either dataset_id or dataset_name must be provided")
        # Resolve the dataset by name when a name is given, both to fill in a
        # missing dataset_id and to detect an id/name mismatch.
        if dataset_name is not None:
            dataset_by_name, err, msg = self._get_dataset_by_name()
            if dataset_by_name is None:
                raise ValueError(f"Dataset with name '{self.dataset_name}' not found.")
            if self.dataset_id is None:
                self.dataset_id = dataset_by_name['_id']
            elif dataset_by_name['_id'] != self.dataset_id:
                raise ValueError("Provided dataset_id does not match the dataset id of the provided dataset_name.")
        self.dataset_details, error, message = self._get_details()
        if error is not None:
            raise ValueError(f"Could not fetch dataset details: {message}")
        self.dataset_id = self.dataset_details['_id']
        self.dataset_name = self.dataset_details['name']
        stats = self.dataset_details.get('stats') or [{}]
        self.version_status = stats[0].get('versionStatus')
        self.latest_version = self.dataset_details['latestVersion']
        self.no_of_samples = sum(stat.get('versionStats', {}).get('total', 0) for stat in stats)
        self.no_of_classes = len(stats[0].get('classStat', {}))
        self.no_of_versions = len(self.dataset_details.get('allVersions', []))
        self.last_updated_at = self.dataset_details.get('updatedAt')
        self.summary, err, msg = self._get_summary(self.latest_version)
def refresh(self):
"""
        Refresh the instance by reinitializing it with the current session and dataset ID.
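        Raises
        ------
        Exception
            If called less than two minutes after the previous refresh.
        Example
        -------
        >>> dataset.refresh()
        >>> print(f"Dataset refreshed at {dataset.last_refresh_time}")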
"""
# Check if two minutes have passed since the last refresh
if datetime.now() - self.last_refresh_time < timedelta(minutes=2):
raise Exception("Refresh can only be called after two minutes since the last refresh.")
init_params = {
'session': self.session,
'dataset_id': self.dataset_id,
}
# Reinitialize the instance
self.__init__(**init_params)
# Update the last refresh time
self.last_refresh_time = datetime.now()
def _get_details(self):
"""
Retrieve dataset details based on the dataset ID or dataset name set during class initialization.
This method attempts to fetch the dataset details using the dataset ID, if available.
If the dataset ID is not provided, it will attempt to retrieve details by the dataset name.
If neither is available, a ValueError is raised.
Returns
-------
tuple
A tuple containing three elements:
- dict
A dictionary containing important dataset information, including:
- `_id` (str): Unique identifier for the dataset.
- `_idAction` (str): Action identifier related to the dataset.
- `_idDatasetVersion` (str): Identifier for the dataset version.
- `_idProject` (str): Project ID associated with the dataset.
- `_idUser` (str): User ID associated with the dataset.
- `allVersions` (list of str): List of all dataset versions.
- `createdAt` (str): Timestamp when the dataset was created.
- `datasetDesc` (str): Description of the dataset.
- `latestVersion` (str): Identifier of the latest dataset version.
- `name` (str): Name of the dataset.
- `stats` (list of dict): Version-specific statistics, including sample counts and splits.
- `type` (str): Type of dataset (e.g., `classification`, `detection`).
- `updatedAt` (str): Last update timestamp of the dataset.
- str or None:
Error message if an error occurred, `None` otherwise.
- str:
Status message indicating success or failure.
Raises
------
ValueError
If neither `dataset_id` nor `dataset_name` is provided.
Examples
--------
>>> dataset_details, err, msg = dataset._get_details()
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(dataset_details)
>>>
>>> # Sample output
>>> {
>>> '_id': '671636dd5cffa65a7510a52b',
>>> 'name': 'MSCOCO',
>>> 'allVersions': ['v1.0', 'v1.1'],
>>> 'latestVersion': 'v1.1',
>>> ...
>>> }
Notes
-----
- `_get_dataset()` is called if `dataset_id` is set to retrieve the dataset by its ID.
- `_get_dataset_by_name()` is used if `dataset_name` is set to fetch the dataset by its name.
"""
        if self.dataset_id:
            return self._get_dataset()
        if self.dataset_name:
            return self._get_dataset_by_name()
        raise ValueError(
            "At least one of 'dataset_id' or 'dataset_name' must be provided."
        )
def _get_summary(self, dataset_version):
"""
Retrieve a summary for a specific dataset version.
This method provides essential metrics for a specified version of the dataset.
Only the `dataset_version` is required, as `dataset_id` and `project_id` are already set during initialization.
Parameters
----------
dataset_version : str
The version of the dataset to fetch the summary for (e.g., "v1.0").
Returns
-------
tuple
A tuple containing three elements:
- dict: Key summary details of the dataset, including:
- `categoryCount` (int): The number of unique categories in the dataset.
- `dataItemCount` (int): Total number of data items in the dataset.
- `histogram` (list of dict): Distribution of items per category, with each dictionary containing:
- `_id` (str): Unique identifier for each category.
- `count` (int): Total count of items in this category.
- `label` (str): Name of the category.
- `train` (int): Number of items in the training set.
- `val` (int): Number of items in the validation set.
- `test` (int): Number of items in the test set.
- `unassigned` (int): Number of unassigned items.
- `testDataItemCount` (int): Number of items in the test set.
- `trainDataItemCount` (int): Number of items in the training set.
- `valDataItemCount` (int): Number of items in the validation set.
- `unassignedDataItemCount` (int): Number of unassigned items.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If the `dataset_id` is not set.
Example
-------
>>> summary, err, msg = dataset._get_summary(dataset_version="v1.0")
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(summary)
>>>
>>> # Sample output
>>> {
>>> 'categoryCount': 2,
>>> 'dataItemCount': 2877,
>>> 'histogram': [{'_id': '671638ef0f4507663b8ca2b7', 'count': 81524, 'label': 'Window', 'train': 70643, 'val': 7302, 'test': 3579, 'unassigned': 0}],
>>> 'testDataItemCount': 120,
>>> 'trainDataItemCount': 2517,
>>> 'unassignedDataItemCount': 0,
>>> 'valDataItemCount': 240
>>> }
"""
        if self.dataset_id is None:
            print("Dataset id is not set; cannot perform this operation without a dataset id.")
            sys.exit(1)
path = f"/v1/dataset/{self.dataset_id}/version/{dataset_version}/summary?projectId={self.project_id}"
resp = self.rpc.get(path=path)
return handle_response(
resp,
"Dataset summary fetched successfully",
"Could not fetch dataset summary",
)
def _get_dataset(self):
"""
Fetch dataset details using the dataset ID.
This function retrieves detailed information about the dataset by its ID. The dataset ID must be set during
initialization for this function to work.
Returns
-------
tuple
A tuple containing:
- dict: API response with detailed dataset information, including:
- `_id` (str): Unique identifier for the dataset.
- `_idAction` (str): Action identifier related to the dataset.
- `_idDatasetVersion` (str): Identifier for the dataset version.
- `_idProject` (str): Project ID associated with the dataset.
- `_idUser` (str): User ID associated with the dataset.
- `allVersions` (list of str): List of all dataset versions.
- `createdAt` (str): Timestamp when the dataset was created.
- `datasetDesc` (str): Description of the dataset.
- `latestVersion` (str): Identifier of the latest dataset version.
- `name` (str): Name of the dataset.
- `stats` (list of dict): Version-specific statistics, including sample counts and splits.
- `type` (str): Type of dataset (e.g., `classification`, `detection`).
- `updatedAt` (str): Last update timestamp of the dataset.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If the `dataset_id` is not set.
Example
-------
>>> resp, err, msg = dataset._get_dataset()
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(resp)
>>>
>>> # Sample output
>>> {
>>> '_id': '671636dd5cffa65a7510a52b',
>>> 'name': 'MSCOCO',
>>> 'allVersions': ['v1.0', 'v1.1'],
>>> 'latestVersion': 'v1.1',
>>> ...
>>> }
"""
        if self.dataset_id is None:
            print("Dataset id is not set; cannot perform this operation without a dataset id.")
            sys.exit(1)
path = f"/v1/dataset/{self.dataset_id}?projectId={self.project_id}"
resp = self.rpc.get(path=path)
return handle_response(
resp, "Dataset fetched successfully", "Could not fetch dataset"
)
def get_categories(self, dataset_version):
"""
Get category details for a specific dataset version.
This function retrieves the categories available in a specified version of the dataset,
including category IDs, names, and associated metadata.
Parameters
----------
dataset_version : str
The version of the dataset for which to fetch categories (e.g., "v1.0").
Returns
-------
tuple
A tuple containing:
- list of dict: Each dictionary contains dataset category details, including:
- `_id` (str): Unique identifier for the category.
- `_idDataset` (str): ID of the dataset to which this category belongs.
- `_idSuperCategory` (str): Identifier for the super-category, if applicable.
- `datasetVersion` (str): Version of the dataset for this category.
- `name` (str): Name of the category.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If the `dataset_id` is not set.
Example
-------
>>> categories, err, msg = dataset.get_categories(dataset_version="v1.0")
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(categories[:3])
>>>
>>> # Sample output
>>> [
>>> {'_id': '671638ef0f4507663b8ca2b7', '_idDataset': '671636dd6cffa65a7510a52b', '_idSuperCategory': '000000000000000000000000', 'datasetVersion': 'v1.0', 'name': 'Dog'},
>>> {'_id': '671638ef0f4507663b8ca2b6', '_idDataset': '671636dd6cffa65a7510a52b', '_idSuperCategory': '000000000000000000000000', 'datasetVersion': 'v1.0', 'name': 'Cat'},
>>> ...
>>> ]
"""
        if self.dataset_id is None:
            print("Dataset id is not set; cannot perform this operation without a dataset id.")
            sys.exit(1)
path = f"/v1/dataset/{self.dataset_id}/version/{dataset_version}/categories?projectId={self.project_id}"
resp = self.rpc.get(path=path)
return handle_response(
resp,
f"Dataset categories for version - {dataset_version} fetched successfully",
"Could not fetch dataset categories",
)
    def _list_items_V2(self, dataset_version, page_size=10, page_number=0):
        """
        Internal alias of `list_items`, kept so existing callers keep working.
        See `list_items` for parameters, return values, and examples.
        """
        return self.list_items(dataset_version, page_size, page_number)
    def list_items(self, dataset_version, page_size=10, page_number=0):
"""
List items for a specific version of the dataset.
This function retrieves a paginated list of items for the specified dataset version,
allowing control over the number of items per page and the page number.
Parameters
----------
dataset_version : str
The version of the dataset to retrieve items from (e.g., "v1.0").
page_size : int, optional
The number of items to return per page (default is 10).
page_number : int, optional
The page number to retrieve (default is 0).
Returns
-------
tuple
A tuple containing:
        - dict: API response containing the paginated list of dataset items.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If the `dataset_id` is not set.
Example
-------
>>> items, err, msg = dataset.list_items(dataset_version="v1.0", page_size=10, page_number=0)
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(items)
"""
        if self.dataset_id is None:
            print("Dataset id is not set; cannot perform this operation without a dataset id.")
            sys.exit(1)
path = f"v1/dataset/{self.dataset_id}/version/{dataset_version}/v2/item?Size={page_size}&pageNumber={page_number}&projectId={self.project_id}"
resp = self.rpc.get(path=path)
return handle_response(
resp,
f"Dataset items for version - {dataset_version} fetched successfully",
"Could not fetch dataset items",
)
def get_processed_versions(self):
"""
Get all processed versions of the dataset.
This function retrieves a list of all versions of the dataset that have completed processing.
Returns
-------
tuple
A tuple containing:
- list of dict: Each dictionary contains processed dataset version details, including:
- `_id` (str): Unique identifier for the dataset.
- `_idProject` (str): Project ID associated with the dataset.
- `allVersions` (list of str): List of all versions of the dataset.
- `createdAt` (str): Timestamp of when the dataset was created.
- `latestVersion` (str): Identifier of the latest version of the dataset.
- `name` (str): Name of the dataset.
- `processedVersions` (list of str): List of processed versions.
- `stats` (list of dict): Version-specific statistics, including:
- `classStat` (dict): Contains category-specific counts for `test`, `train`, `unassigned`, and `val`.
- `version` (str): Version identifier.
- `versionDescription` (str): Description of the version.
- `versionStats` (dict): Overall statistics, including `total`, `train`, `test`, and `val` counts.
- `versionStatus` (str): Status of the version, usually "processed".
- `updatedAt` (str): Timestamp of the last dataset update.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If the `dataset_id` is not set.
Example
-------
>>> processed_versions, err, msg = dataset.get_processed_versions()
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(processed_versions[:3])
>>>
>>> # Sample output
>>> [
>>> {'_id': '6703af894ddeac5b596b267b', '_idProject': '67036673ccb244bee86d1939', 'allVersions': ['v1.0', 'v1.1'], 'createdAt': '2024-10-07T09:53:13.223Z', 'name': 'Microcontroller', 'processedVersions': ['v1.1'], 'latestVersion': 'v1.1', ...},
>>> ...
>>> ]
"""
        if self.dataset_id is None:
            print("Dataset id is not set; cannot perform this operation without a dataset id.")
            sys.exit(1)
path = f"/v1/dataset/get_processed_versions?projectId={self.project_id}"
resp = self.rpc.get(path=path)
return handle_response(
resp,
f"Processed versions fetched successfully",
"Could not fetch processed versions",
)
    def check_valid_splits(self, dataset_version):
"""
Check if the specified dataset version contains valid splits.
Valid splits include training, validation, and test sets. This function verifies that the
specified dataset version has these splits properly configured.
Parameters
----------
dataset_version : str
The version of the dataset to check for valid splits (e.g., "v1.0").
Returns
-------
tuple
A tuple containing:
- dict: API response indicating split validity, which includes:
- `isValid` (str): Indicates if the splits are valid.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If the `dataset_id` is not set.
Example
-------
>>> split_status, err, msg = dataset.check_valid_splits(dataset_version="v1.0")
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(split_status)
>>>
>>> # Sample output
>>> 'Valid Spilts'
"""
        if self.dataset_id is None:
            print("Dataset id is not set; cannot perform this operation without a dataset id.")
            sys.exit(1)
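        # The "spilts" spelling in the route below matches the server-side endpoint.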
path = f"/v1/dataset/check_valid_spilts/{self.dataset_id}/{dataset_version}?projectId={self.project_id}"
resp = self.rpc.get(path=path)
return handle_response(resp, f"Splits are valid", "Splits are invalid")
def _get_dataset_by_name(self):
"""
Fetch dataset details using the dataset name.
This function retrieves detailed information about the dataset by its name. The dataset name
must be provided during initialization for this function to work.
Returns
-------
tuple
A tuple containing:
- dict: API response with dataset details, including:
- `_id` (str): Unique identifier for the dataset.
- `_idProject` (str): Project ID associated with the dataset.
- `name` (str): Name of the dataset.
- `type` (str): Type of dataset (e.g., `classification`, `detection`).
- `createdAt` (str): Timestamp of when the dataset was created.
- `updatedAt` (str): Last update timestamp of the dataset.
- `latestVersion` (str): Identifier of the latest dataset version.
- `allVersions` (list of str): List of all versions available for the dataset.
- `description` (str): Brief description of the dataset.
- `stats` (list of dict): Version-specific statistics and counts.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If `dataset_name` is not set.
Example
-------
>>> dataset_details, err, msg = dataset._get_dataset_by_name()
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(dataset_details)
>>>
>>> # Sample output
>>> {
>>> '_id': '671636dd5cffa65a7510a52b',
>>> 'name': 'Sample Dataset',
>>> 'latestVersion': 'v1.2',
>>> 'allVersions': ['v1.0', 'v1.1', 'v1.2'],
>>> ...
>>> }
"""
if self.dataset_name == "":
print(
"Dataset name not set for this dataset. Cannot perform the operation for dataset without dataset name"
)
sys.exit(0)
path = f"/v1/dataset/get_dataset_by_name?datasetName={self.dataset_name}&projectId={self.project_id}"
resp = self.rpc.get(path=path)
return handle_response(
resp,
f"Dataset Details Fetched successfully",
"Could not fetch dataset details",
)
# PUT REQUESTS
def rename(self, updated_name):
"""
Update the name of the dataset.
This function updates the dataset name to a specified value. The dataset ID must
be set during initialization for this function to work.
Parameters
----------
updated_name : str
The new name for the dataset.
Returns
-------
tuple
A tuple containing:
- dict: API response confirming the dataset name update, including:
- `MatchedCount` (int): Number of records matched for the update.
- `ModifiedCount` (int): Number of records modified.
- `UpsertedCount` (int): Number of records upserted (inserted if not existing).
- `UpsertedID` (str or None): ID of the upserted record if applicable, otherwise `None`.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If the `dataset_id` is not set.
Example
-------
>>> response, err, msg = dataset.rename(updated_name="Updated Dataset Name")
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(response)
>>>
>>> # Sample output
>>> {
>>> 'MatchedCount': 1,
>>> 'ModifiedCount': 1,
>>> 'UpsertedCount': 0,
>>> 'UpsertedID': None
>>> }
"""
        if self.dataset_id is None:
            print("Dataset id is not set; cannot perform this operation without a dataset id.")
            sys.exit(1)
path = f"/v1/dataset/{self.dataset_id}?projectId={self.project_id}"
headers = {"Content-Type": "application/json"}
body = {"name": updated_name}
resp = self.rpc.put(path=path, headers=headers, payload=body)
return handle_response(
resp,
f"Successfully updated dataset name to {updated_name}",
"Could not update datename",
)
def update_item_label(self, dataset_version, item_id, label_id):
"""
Update the label of a specific dataset item.
This function assigns a new label to a specific item in a specified dataset version.
The dataset ID must be set during initialization for this function to work.
Parameters
----------
dataset_version : str
The version of the dataset where the item resides (e.g., "v1.0").
item_id : str
The unique identifier of the dataset item to update.
label_id : str
The unique identifier of the new label to assign to the dataset item.
Returns
-------
tuple
A tuple containing:
- dict: API response confirming the label update.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If the `dataset_id` is not set.
Example
-------
>>> response, err, msg = dataset.update_item_label(dataset_version="v1.0", item_id="12345", label_id="67890")
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(response)
"""
        if self.dataset_id is None:
            print("Dataset id is not set; cannot perform this operation without a dataset id.")
            sys.exit(1)
path = f"/v1/dataset/{self.dataset_id}/version/{dataset_version}/item/{item_id}/label?projectId={self.project_id}"
headers = {"Content-Type": "application/json"}
body = {"labelId": label_id}
resp = self.rpc.put(path=path, headers=headers, payload=body)
return handle_response(
resp,
"Update data item label in progress",
"Could not update the date item label",
)
# POST REQUESTS
def add_data(
self,
source,
source_url,
new_dataset_version,
old_dataset_version,
dataset_description="",
version_description="",
compute_alias="",
):
"""
Import a new version of the dataset from an external source. Only ZIP files are supported for upload.
This function creates a new dataset version or updates an existing version with data from a specified
external source URL. The dataset ID must be set during initialization for this function to work.
Parameters
----------
source : str
The source of the dataset, indicating where the dataset originates (e.g., "url").
source_url : str
The URL of the dataset to be imported.
new_dataset_version : str
The version identifier for the new dataset (e.g., "v2.0").
old_dataset_version : str
The version identifier of the existing dataset to be updated.
dataset_description : str, optional
Description of the dataset (default is an empty string).
version_description : str, optional
Description for the new dataset version (default is an empty string).
compute_alias : str, optional
Alias for the compute instance to be used (default is an empty string).
Returns
-------
tuple
A tuple containing:
- dict: API response indicating the status of the dataset import.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If the `dataset_id` is not set or if the old dataset version is incomplete.
Example
-------
>>> response, err, msg = dataset.add_data(
>>> source="url",
>>> source_url="https://example.com/dataset.zip",
>>> new_dataset_version="v2.0",
>>> old_dataset_version="v1.0"
>>> )
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(response)
"""
        if self.dataset_id is None:
            print("Dataset id is not set; cannot perform this operation without a dataset id.")
            sys.exit(1)
dataset_resp, err, message = self._get_dataset()
if err is not None:
return dataset_resp, err, message
stats = dataset_resp["stats"]
if dataset_description == "":
dataset_description = dataset_resp["datasetDesc"]
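        # Only a base version with status "processed" can accept new data; when
        # re-importing into the same version, reuse its existing description.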
for stat in stats:
if stat["version"] != old_dataset_version:
continue
if stat["versionStatus"] != "processed":
resp = {}
err = None
message = f"Only the dataset versions with complete status can be updated.Version {old_dataset_version} of the dataset doesn't have status complete."
return resp, err, message
if version_description == "" and old_dataset_version == new_dataset_version:
version_description = stat["versionDescription"]
break
        # Ask the backend to create a brand-new version only when the target
        # version differs from the base version; matching identifiers mean the
        # existing version is updated in place.
        is_create_new = new_dataset_version != old_dataset_version
        path = f"/v1/dataset/{self.dataset_id}/import?projectId={self.project_id}"
headers = {"Content-Type": "application/json"}
body = {
"source": source,
"sourceUrl": source_url,
"isCreateNew": is_created_new,
"isUnlabeled": False,
"newDatasetVersion": new_dataset_version,
"oldDatasetVersion": old_dataset_version,
"newVersionDescription": version_description,
"datasetDesc": dataset_description,
"computeAlias": compute_alias,
}
resp = self.rpc.post(path=path, headers=headers, payload=body)
return handle_response(
resp,
"New data item addition in progress",
"An error occured while trying to add new data item.",
)
def split_data(
self,
old_dataset_version,
new_dataset_version,
is_random_split,
train_num=0,
val_num=0,
test_num=0,
transfers=[{"source": "", "destination": "", "transferAmount": 1}],
dataset_description="",
version_description="",
new_version_description="",
compute_alias="",
):
"""
Split or transfer images between training, validation, and test sets in the dataset.
This function enables the creation of a new dataset version by transferring or splitting images from an existing
version into training, validation, and test sets, with options for random or manual split distribution.
Parameters
----------
old_dataset_version : str
The version identifier of the existing dataset.
new_dataset_version : str
The version identifier of the new dataset.
is_random_split : bool
Indicates whether to perform a random split.
train_num : int, optional
Number of training samples (default is 0).
val_num : int, optional
Number of validation samples (default is 0).
test_num : int, optional
Number of test samples (default is 0).
transfers : list of dict, optional
List specifying transfers between dataset sets. Each dictionary should contain:
- `source` (str): The source set (e.g., "train").
- `destination` (str): The target set (e.g., "test").
- `transferAmount` (int): Number of items to transfer (default is 1).
dataset_description : str, optional
Description of the dataset (default is an empty string).
version_description : str, optional
Description of the dataset version (default is an empty string).
new_version_description : str, optional
Description of the new dataset version (default is an empty string).
compute_alias : str, optional
Alias for the compute instance (default is an empty string).
Returns
-------
tuple
A tuple containing:
- dict: API response indicating the status of the dataset split or transfer.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If the `dataset_id` is not set or if the `old_dataset_version` is not processed.
Example
-------
>>> response, err, msg = dataset.split_data(
>>> old_dataset_version="v1.0",
>>> new_dataset_version="v2.0",
>>> is_random_split=True,
>>> train_num=100,
>>> val_num=20,
>>> test_num=30,
>>> transfers=[{"source": "train", "destination": "test", "transferAmount": 100}]
>>> )
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(response)
"""
        if self.dataset_id is None:
            print("Dataset id is not set; cannot perform this operation without a dataset id.")
            sys.exit(1)
        if transfers is None:
            # Default applied here rather than in the signature to avoid a shared
            # mutable default argument.
            transfers = [{"source": "", "destination": "", "transferAmount": 1}]
dataset_resp, err, message = self._get_dataset()
if err is not None:
return dataset_resp, err, message
stats = dataset_resp["stats"]
if dataset_description == "":
dataset_description = dataset_resp["datasetDesc"]
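        # As in add_data, only a base version with status "processed" can be split.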
for stat in stats:
if stat["version"] != old_dataset_version:
continue
if stat["versionStatus"] != "processed":
resp = {}
err = None
message = f"Only the dataset versions with complete status can be updated.Version {old_dataset_version} of the dataset doesn't have status complete."
return resp, err, message
if version_description == "" and old_dataset_version == new_dataset_version:
version_description = stat["versionDescription"]
break
path = f"/v1/dataset/{self.dataset_id}/split_data?projectId={self.project_id}"
headers = {"Content-Type": "application/json"}
body = {
"trainNum": train_num,
"testNum": test_num,
"valNum": val_num,
"unassignedNum": 0,
"oldDatasetVersion": old_dataset_version,
"newDatasetVersion": new_dataset_version,
"isRandomSplit": is_random_split,
"datasetDesc": dataset_description,
"newVersionDescription": new_version_description,
"transfers": transfers,
"computeAlias": compute_alias,
}
resp = self.rpc.post(path=path, headers=headers, payload=body)
return handle_response(
resp,
"Dataset spliting in progress",
"An error occured while trying to split the data.",
)
# DELETE REQUESTS
def delete_item(self, dataset_version, dataset_item_ids):
"""
Delete items from a specific version of the dataset based on dataset type.
This function deletes items from a specified version of the dataset. The deletion method is selected
automatically based on the dataset type (e.g., classification, detection). The dataset ID must be set
during initialization for this function to work.
Parameters
----------
dataset_version : str
The version of the dataset from which to delete items.
dataset_item_ids : list of str
A list of dataset item IDs to delete.
Returns
-------
tuple
A tuple containing:
- dict: API response indicating the deletion status.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
ValueError
If the dataset type is unsupported.
Example
-------
>>> response, err, msg = dataset.delete_item(
>>> dataset_version="v1.0", dataset_item_ids=["123", "456"]
>>> )
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(response)
"""
# Retrieve the dataset details to get the type
resp, error, message = self._get_details()
if error:
return resp, error, message
dataset_type = resp.get('type')
# Check dataset type and call the respective delete function
if dataset_type == "classification":
return self._delete_item_classification(dataset_version, dataset_item_ids)
elif dataset_type == "detection":
return self._delete_item_detection(dataset_version, dataset_item_ids)
else:
return {}, f"Unsupported dataset type: {dataset_type}.", "Failed to delete dataset items"
def _delete_item_classification(self, dataset_version, dataset_item_ids):
"""
Delete items from a classification dataset version.
This function deletes specific items from a given version of a classification dataset.
The dataset ID must be set during initialization for this function to work.
Parameters
----------
dataset_version : str
The version of the classification dataset from which to delete items.
dataset_item_ids : list of str
A list of dataset item IDs to delete.
Returns
-------
tuple
A tuple containing:
- dict: API response confirming the deletion status.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If the `dataset_id` is not set.
Example
-------
>>> response, err, msg = dataset._delete_item_classification(
>>> dataset_version="v1.0", dataset_item_ids=["123", "456"]
>>> )
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(response)
"""
        if self.dataset_id is None:
            print("Dataset id is not set; cannot perform this operation without a dataset id.")
            sys.exit(1)
path = f"/v1/dataset/version/{dataset_version}/dataset_item_classification?projectId={self.project_id}&datasetId={self.dataset_id}"
requested_payload = {"datasetItemIds": dataset_item_ids}
headers = {"Content-Type": "application/json"}
resp = self.rpc.delete(path=path, headers=headers, payload=requested_payload)
return handle_response(
resp,
f"Given dataset items deleted successfully",
"Could not delete the given dataset items",
)
def _delete_item_detection(self, dataset_version, dataset_item_ids):
"""
Delete items from a detection dataset version.
This function deletes specified items from a given version of a detection dataset.
The dataset ID must be set during initialization for this function to work.
Parameters
----------
dataset_version : str
The version of the detection dataset from which to delete items.
dataset_item_ids : list of str
A list of dataset item IDs to delete.
Returns
-------
tuple
A tuple containing:
- dict: API response confirming the deletion status.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If the `dataset_id` is not set.
Example
-------
>>> response, err, msg = dataset._delete_item_detection(
>>> dataset_version="v1.0", dataset_item_ids=["123", "456"]
>>> )
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(response)
"""
        if self.dataset_id is None:
            print("Dataset id is not set; cannot perform this operation without a dataset id.")
            sys.exit(1)
path = f"/v1/dataset/version/{dataset_version}/dataset_item_detection?projectId={self.project_id}&datasetId={self.dataset_id}"
requested_payload = {"datasetItemIds": dataset_item_ids}
headers = {"Content-Type": "application/json"}
resp = self.rpc.delete(path=path, headers=headers, payload=requested_payload)
return handle_response(
resp,
f"Given dataset items deleted successfully",
"Could not delete the given dataset items",
)
def delete_version(self, dataset_version):
"""
Delete a specific version of the dataset.
This function removes a specified version of the dataset. The dataset ID must be set
during initialization for this function to work.
Parameters
----------
dataset_version : str
The version identifier of the dataset to delete (e.g., "v1.0").
Returns
-------
tuple
A tuple containing:
- dict: API response confirming the deletion status.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If the `dataset_id` is not set.
Example
-------
>>> response, err, msg = dataset.delete_version(dataset_version="v1.0")
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(response)
"""
        if self.dataset_id is None:
            print("Dataset id is not set; cannot perform this operation without a dataset id.")
            sys.exit(1)
path = f"/v1/dataset/{self.dataset_id}/version/{dataset_version}?projectId={self.project_id}"
resp = self.rpc.delete(path=path)
return handle_response(
resp,
f"Successfully deleted version - {dataset_version}",
"Could not delete the said version",
)
def delete(self):
"""
Delete the entire dataset.
This function deletes the entire dataset associated with the given dataset ID. The dataset ID
must be set during initialization for this function to work.
Returns
-------
tuple
A tuple containing:
- dict: API response confirming the dataset deletion status.
- str or None: Error message if an error occurred, `None` otherwise.
- str: Status message indicating success or failure.
Raises
------
SystemExit
If the `dataset_id` is not set.
Example
-------
>>> response, err, msg = dataset.delete()
>>> if err:
>>> pprint(err)
>>> else:
>>> pprint(response)
"""
        if self.dataset_id is None:
            print("Dataset id is not set; cannot perform this operation without a dataset id.")
            sys.exit(1)
path = f"/v1/dataset/{self.dataset_id}?projectId={self.project_id}"
resp = self.rpc.delete(path=path)
return handle_response(
resp, f"Successfully deleted the dataset", "Could not delete the dataset"
)