collate_fn(batch)

Convert a batch of numpy arrays to tensors (with appropriate padding).

Parameters:
  • batch (Dict[str, np.ndarray]) –

    input batch as a dictionary of numpy arrays.

Returns:
  • Dict[str, torch.Tensor] –

    output batch as a dictionary of tensors.

madewithml/utils.py
def collate_fn(batch: Dict[str, np.ndarray]) -> Dict[str, torch.Tensor]:  # pragma: no cover, air internal
    """Convert a batch of numpy arrays to tensors (with appropriate padding).

    Args:
        batch (Dict[str, np.ndarray]): input batch as a dictionary of numpy arrays.

    Returns:
        Dict[str, torch.Tensor]: output batch as a dictionary of tensors.
    """
    batch["ids"] = pad_array(batch["ids"])
    batch["masks"] = pad_array(batch["masks"])
    dtypes = {"ids": torch.int32, "masks": torch.int32, "targets": torch.int64}
    tensor_batch = {}
    for key, array in batch.items():
        tensor_batch[key] = torch.as_tensor(array, dtype=dtypes[key], device=get_device())
    return tensor_batch
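
A minimal usage sketch with a hypothetical ragged batch (assumes get_device() from this module resolves to an available device):

import numpy as np

batch = {
    "ids": np.array([[1, 2, 3], [4, 5]], dtype=object),    # ragged token ids
    "masks": np.array([[1, 1, 1], [1, 1]], dtype=object),  # ragged attention masks
    "targets": np.array([0, 1]),                           # class indices
}
tensor_batch = collate_fn(batch)
print(tensor_batch["ids"].shape)  # torch.Size([2, 3]) after zero-padding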

dict_to_list(data, keys)

Convert a dictionary to a list of dictionaries.

Parameters:
  • data (Dict) –

    input dictionary.

  • keys (List[str]) –

    keys to include in the output list of dictionaries.

Returns:
  • List[Dict[str, Any]] –

    output list of dictionaries.

madewithml/utils.py
def dict_to_list(data: Dict, keys: List[str]) -> List[Dict[str, Any]]:
    """Convert a dictionary to a list of dictionaries.

    Args:
        data (Dict): input dictionary.
        keys (List[str]): keys to include in the output list of dictionaries.

    Returns:
        List[Dict[str, Any]]: output list of dictionaries.
    """
    list_of_dicts = []
    for i in range(len(data[keys[0]])):
        new_dict = {key: data[key][i] for key in keys}
        list_of_dicts.append(new_dict)
    return list_of_dicts
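
For example, a column-oriented dict becomes row-oriented records (keys not listed are dropped):

data = {"title": ["t1", "t2"], "tag": ["mlops", "nlp"], "split": ["train", "val"]}
dict_to_list(data, keys=["title", "tag"])
# [{'title': 't1', 'tag': 'mlops'}, {'title': 't2', 'tag': 'nlp'}]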

get_col(ds, col)

Return an array of values from a specific array column in a Ray Dataset.

Parameters:
  • ds (Dataset) –

    Ray Dataset.

  • col (str) –

    name of the column to extract values from.

Returns:
  • np.ndarray –

    an array of the column's values.

madewithml/utils.py
def get_col(ds: Dataset, col: str) -> np.ndarray:
    """Return an array of values from a specific array column in a Ray Dataset.

    Args:
        ds (Dataset): Ray Dataset.
        col (str): name of the column to extract values from.

    Returns:
        np.array: an array of the column's values.
    """
    values = ds.select_columns(cols=[col]).take_all()
    return np.stack([item[col] for item in values])
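
A small sketch with an in-memory Ray Dataset (the column names below are made up):

import ray

ds = ray.data.from_items([{"id": 1, "embedding": [0.1, 0.2]},
                          {"id": 2, "embedding": [0.3, 0.4]}])
get_col(ds, col="embedding")  # array of shape (2, 2)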

get_run_id(experiment_name, trial_id)

Get the MLflow run ID for a specific Ray trial ID.

Parameters:
  • experiment_name (str) –

    name of the experiment.

  • trial_id (str) –

    id of the trial.

Returns:
  • str –

    run id of the trial.

madewithml/utils.py
def get_run_id(experiment_name: str, trial_id: str) -> str:  # pragma: no cover, mlflow functionality
    """Get the MLflow run ID for a specific Ray trial ID.

    Args:
        experiment_name (str): name of the experiment.
        trial_id (str): id of the trial.

    Returns:
        str: run id of the trial.
    """
    trial_name = f"TorchTrainer_{trial_id}"
    run = mlflow.search_runs(experiment_names=[experiment_name], filter_string=f"tags.trial_name = '{trial_name}'").iloc[0]
    return run.run_id
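
A usage sketch; it assumes an MLflow tracking store that already contains a run tagged trial_name = "TorchTrainer_<trial_id>" (e.g., logged during a Ray Train run). The experiment name and trial ID below are placeholders:

run_id = get_run_id(experiment_name="llm", trial_id="094e2a0e")
print(run_id)  # the matching MLflow run ID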

load_dict(path)

Load a dictionary from a JSON file's path.

Parameters:
  • path (str) –

    location of file.

Returns:
  • Dict –

    loaded JSON data.

madewithml/utils.py
def load_dict(path: str) -> Dict:
    """Load a dictionary from a JSON's filepath.

    Args:
        path (str): location of file.

    Returns:
        Dict: loaded JSON data.
    """
    with open(path) as fp:
        d = json.load(fp)
    return d
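
A self-contained sketch that writes a temporary JSON file and loads it back:

import json, tempfile

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as fp:
    json.dump({"lower": True, "stem": False}, fp)
load_dict(path=fp.name)
# {'lower': True, 'stem': False}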

pad_array(arr, dtype=np.int32)

Pad a 2D array with zeros until all rows are the same length as the longest row in the array.

Parameters:
  • arr (np.array) –

    input array.

  • dtype (optional) –

    dtype of the padded output array. Defaults to np.int32.

Returns:
  • np.ndarray –

    zero-padded array.

madewithml/utils.py
def pad_array(arr: np.ndarray, dtype=np.int32) -> np.ndarray:
    """Pad an 2D array with zeros until all rows in the
    2D array are of the same length as a the longest
    row in the 2D array.

    Args:
        arr (np.array): input array

    Returns:
        np.array: zero padded array
    """
    max_len = max(len(row) for row in arr)
    padded_arr = np.zeros((arr.shape[0], max_len), dtype=dtype)
    for i, row in enumerate(arr):
        padded_arr[i][: len(row)] = row
    return padded_arr
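
For example, given ragged rows stored in an object array:

import numpy as np

arr = np.array([[1, 2, 3], [4, 5]], dtype=object)
pad_array(arr)
# array([[1, 2, 3],
#        [4, 5, 0]], dtype=int32)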

save_dict(d, path, cls=None, sortkeys=False)

Save a dictionary to a specific location.

Parameters:
  • d (Dict) –

    data to save.

  • path (str) –

    location of where to save the data.

  • cls (optional) –

    custom JSON encoder class to use on the dict data. Defaults to None.

  • sortkeys (bool, optional) –

    whether to sort keys alphabetically. Defaults to False.

madewithml/utils.py
def save_dict(d: Dict, path: str, cls: Any = None, sortkeys: bool = False) -> None:
    """Save a dictionary to a specific location.

    Args:
        d (Dict): data to save.
        path (str): location of where to save the data.
        cls (optional): encoder to use on dict data. Defaults to None.
        sortkeys (bool, optional): whether to sort keys alphabetically. Defaults to False.
    """
    directory = os.path.dirname(path)
    if directory and not os.path.exists(directory):  # pragma: no cover
        os.makedirs(directory)
    with open(path, "w") as fp:
        json.dump(d, indent=2, fp=fp, cls=cls, sort_keys=sortkeys)
        fp.write("\n")
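
A quick round trip with load_dict (the path below is hypothetical; missing parent directories are created automatically):

save_dict({"beta": 2, "alpha": 1}, path="results/params.json", sortkeys=True)
load_dict(path="results/params.json")
# {'alpha': 1, 'beta': 2}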

set_seeds(seed=42)

Set seeds for reproducibility.

Parameters:
  • seed (int) –

    seed value to use. Defaults to 42.

madewithml/utils.py
def set_seeds(seed: int = 42):
    """Set seeds for reproducibility."""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    eval("setattr(torch.backends.cudnn, 'deterministic', True)")
    eval("setattr(torch.backends.cudnn, 'benchmark', False)")
    os.environ["PYTHONHASHSEED"] = str(seed)
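
Re-seeding makes random draws repeatable, for example:

set_seeds(seed=1234)
a = torch.rand(3)
set_seeds(seed=1234)
b = torch.rand(3)
assert torch.equal(a, b)  # identical draws after re-seeding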