collate_fn(batch)
Convert a batch of numpy arrays to tensors (with appropriate padding).
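Parameters:

| Name | Type | Description |
|---|---|---|
| batch | Dict[str, np.ndarray] | input batch as a dictionary of numpy arrays. |

Returns:

| Type | Description |
|---|---|
| Dict[str, torch.Tensor] | output batch as a dictionary of tensors. |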
madewithml/utils.py
def collate_fn(batch: Dict[str, np.ndarray]) -> Dict[str, torch.Tensor]:  # pragma: no cover, air internal
    """Turn a batch of numpy arrays into a batch of tensors (padding where needed).

    The "ids" and "masks" entries are passed through `pad_array` and the padded
    arrays are written back into `batch`, so the input dictionary is modified
    in place. Every entry is then converted with `torch.as_tensor` on the
    device returned by `get_device()`.

    Args:
        batch (Dict[str, np.ndarray]): input batch as a dictionary of numpy arrays.
            Must contain "ids" and "masks"; any key other than "ids", "masks"
            or "targets" has no dtype mapping and raises a KeyError.

    Returns:
        Dict[str, torch.Tensor]: output batch as a dictionary of tensors
            ("ids"/"masks" as int32, "targets" as int64).
    """
    # Pad the variable-length entries first; results replace the originals in `batch`.
    for padded_key in ("ids", "masks"):
        batch[padded_key] = pad_array(batch[padded_key])

    target_dtypes = {"ids": torch.int32, "masks": torch.int32, "targets": torch.int64}
    return {
        name: torch.as_tensor(values, dtype=target_dtypes[name], device=get_device())
        for name, values in batch.items()
    }
dict_to_list(data, keys)
Convert a dictionary to a list of dictionaries.
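Parameters:

| Name | Type | Description |
|---|---|---|
| data | Dict | input dictionary. |
| keys | List[str] | keys to include in the output list of dictionaries. |

Returns:

| Type | Description |
|---|---|
| List[Dict[str, Any]] | output list of dictionaries. |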
madewithml/utils.py
def dict_to_list(data: Dict, keys: List[str]) -> List[Dict[str, Any]]:
    """Convert a dictionary of columns to a list of row dictionaries.

    Args:
        data (Dict): input dictionary mapping each key to an indexable
            sequence of values.
        keys (List[str]): keys to include in the output list of dictionaries.

    Returns:
        List[Dict[str, Any]]: output list of dictionaries, one per index of
            the first requested key's values. Empty when `keys` is empty.

    Raises:
        KeyError: if a requested key is missing from `data`.
        IndexError: if a requested key has fewer values than the first key.
    """
    # With no keys there is no column to take the row count from.
    if not keys:
        return []

    # The first requested key determines how many rows are produced.
    num_rows = len(data[keys[0]])
    return [{key: data[key][i] for key in keys} for i in range(num_rows)]
get_col(ds, col)
Return an array of values from a specific array column in a Ray Dataset.
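Parameters:

| Name | Type | Description |
|---|---|---|
| ds | Dataset | Ray Dataset. |
| col | str | name of the column to extract values from. |

Returns:

| Type | Description |
|---|---|
| np.ndarray | an array of the column's values. |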
madewithml/utils.py
def get_col(ds: Dataset, col: str) -> np.ndarray:
    """Collect every value of one array column of a Ray Dataset into a single array.

    Args:
        ds (Dataset): Ray Dataset.
        col (str): name of the column to extract values from.

    Returns:
        np.ndarray: the column's values stacked along a new first axis.
    """
    # Restrict to the requested column before pulling all rows.
    rows = ds.select_columns(cols=[col]).take_all()
    column_values = []
    for row in rows:
        column_values.append(row[col])
    return np.stack(column_values)
get_run_id(experiment_name, trial_id)
Get the MLflow run ID for a specific Ray trial ID.
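Parameters:

| Name | Type | Description |
|---|---|---|
| experiment_name | str | name of the experiment. |
| trial_id | str | id of the trial. |

Returns:

| Type | Description |
|---|---|
| str | run id of the trial. |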
madewithml/utils.py
def get_run_id(experiment_name: str, trial_id: str) -> str:  # pragma: no cover, mlflow functionality
    """Look up the MLflow run ID that belongs to a given Ray trial ID.

    Runs of the experiment are searched for a `trial_name` tag equal to
    "TorchTrainer_<trial_id>", and the first result is used.

    Args:
        experiment_name (str): name of the experiment.
        trial_id (str): id of the trial.

    Returns:
        str: run id of the trial.
    """
    trial_name = f"TorchTrainer_{trial_id}"
    tag_filter = f"tags.trial_name = '{trial_name}'"
    matching_runs = mlflow.search_runs(
        experiment_names=[experiment_name],
        filter_string=tag_filter,
    )
    # NOTE(review): presumably fails with an IndexError when no run matches - confirm.
    first_run = matching_runs.iloc[0]
    return first_run.run_id
load_dict(path)
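Load a dictionary from a JSON file path.

Parameters:

| Name | Type | Description |
|---|---|---|
| path | str | location of file. |

Returns:

| Type | Description |
|---|---|
| Dict | loaded JSON data. |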
madewithml/utils.py
def load_dict(path: str) -> Dict:
    """Load a dictionary from a JSON file.

    Args:
        path (str): location of file.

    Returns:
        Dict: loaded JSON data.

    Raises:
        FileNotFoundError: if `path` does not exist.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(path, encoding="utf-8") as fp:
        return json.load(fp)
pad_array(arr, dtype=np.int32)
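Pad a 2D array with zeros until all rows in the 2D array are of the same length as the longest row in the 2D array.

Parameters:

| Name | Type | Description |
|---|---|---|
| arr | np.ndarray | input array |
| dtype | | data type of the padded array (default `np.int32`) |

Returns:

| Type | Description |
|---|---|
| np.ndarray | zero padded array |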
madewithml/utils.py
def pad_array(arr: np.ndarray, dtype=np.int32) -> np.ndarray:
    """Pad a 2D array with zeros until all rows in the
    2D array are of the same length as the longest
    row in the 2D array.

    Args:
        arr (np.ndarray): input array (or sequence) of rows, where rows may
            have different lengths.
        dtype (optional): data type of the padded array. Defaults to np.int32.

    Returns:
        np.ndarray: zero padded array of shape (number of rows, longest row
            length). An empty input yields an array of shape (0, 0).
    """
    # default=0 keeps an empty input from raising inside max().
    max_len = max((len(row) for row in arr), default=0)
    padded_arr = np.zeros((len(arr), max_len), dtype=dtype)
    for i, row in enumerate(arr):
        # Copy the row into the left part; the remainder stays zero.
        padded_arr[i, : len(row)] = row
    return padded_arr
save_dict(d, path, cls=None, sortkeys=False)
Save a dictionary to a specific location.
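Parameters:

| Name | Type | Description |
|---|---|---|
| d | Dict | data to save. |
| path | str | location of where to save the data. |
| cls | optional | encoder to use on dict data. Defaults to None. |
| sortkeys | bool, optional | whether to sort keys alphabetically. Defaults to False. |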
madewithml/utils.py
def save_dict(d: Dict, path: str, cls: Any = None, sortkeys: bool = False) -> None:
    """Save a dictionary to a specific location as indented JSON.

    Missing parent directories of `path` are created. The file ends with a
    trailing newline.

    Args:
        d (Dict): data to save.
        path (str): location of where to save the data.
        cls (optional): encoder to use on dict data. Defaults to None.
        sortkeys (bool, optional): whether to sort keys alphabetically. Defaults to False.
    """
    directory = os.path.dirname(path)
    if directory:
        # exist_ok avoids the race between checking for the directory and
        # creating it when several processes save at the same time.
        os.makedirs(directory, exist_ok=True)
    with open(path, "w", encoding="utf-8") as fp:
        json.dump(d, indent=2, fp=fp, cls=cls, sort_keys=sortkeys)
        fp.write("\n")
set_seeds(seed=42)
Set seeds for reproducibility.
madewithml/utils.py
def set_seeds(seed: int = 42) -> None:
    """Set seeds for reproducibility.

    Seeds NumPy, Python's `random` module and PyTorch (CPU and CUDA), switches
    cuDNN to deterministic mode with benchmarking disabled, and exports
    PYTHONHASHSEED.

    Args:
        seed (int, optional): seed value to use everywhere. Defaults to 42.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Plain attribute assignment; there is no need to route this through eval().
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE: hash randomization is fixed at interpreter start-up, so this only
    # affects processes started after this call, not the current one.
    os.environ["PYTHONHASHSEED"] = str(seed)