CustomPreprocessor

Bases: Preprocessor

Custom preprocessor class.

Source code in madewithml/data.py
class CustomPreprocessor(Preprocessor):
    """Custom preprocessor class."""

    def _fit(self, ds):
        tags = ds.unique(column="tag")
        self.class_to_index = {tag: i for i, tag in enumerate(tags)}
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}

    def _transform_pandas(self, batch):  # could also do _transform_numpy
        return preprocess(batch, class_to_index=self.class_to_index)
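
A minimal usage sketch, assuming a Ray Dataset with a tag column (e.g. the splits returned by stratify_split below); Ray's Preprocessor.fit returns the fitted preprocessor, and transform dispatches to _transform_pandas:

preprocessor = CustomPreprocessor()
preprocessor = preprocessor.fit(train_ds)  # calls _fit to learn class_to_index from the "tag" column
train_ds = preprocessor.transform(train_ds)  # applies preprocess() to each pandas batch
test_ds = preprocessor.transform(test_ds)  # reuses the same class mapping on the test split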

clean_text(text, stopwords=STOPWORDS)

Clean raw text string.

Parameters:
  • text (str) –

    Raw text to clean.

  • stopwords (List) –

    List of stopwords to remove from the text. Defaults to STOPWORDS.

Returns:
  • str –

    Cleaned text.

Source code in madewithml/data.py
def clean_text(text: str, stopwords: List = STOPWORDS) -> str:
    """Clean raw text string.

    Args:
        text (str): Raw text to clean.
        stopwords (List, optional): List of stopwords to remove from the text. Defaults to STOPWORDS.

    Returns:
        str: Cleaned text.
    """
    # Lower
    text = text.lower()

    # Remove stopwords
    pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
    text = pattern.sub(" ", text)

    # Remove links (before punctuation is stripped, so URLs can still match)
    text = re.sub(r"http\S+", "", text)

    # Spacing and filters
    text = re.sub(r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text)  # add spacing around punctuation
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non-alphanumeric chars
    text = re.sub(" +", " ", text)  # collapse multiple spaces
    text = text.strip()  # strip whitespace at the ends

    return text
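
For example, assuming STOPWORDS includes common English stopwords such as "with" and "for":

text = "Transfer learning with transformers for text classification."
clean_text(text)
# 'transfer learning transformers text classification'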

load_data(dataset_loc, num_samples=None)

Load data from source into a Ray Dataset.

Parameters:
  • dataset_loc (str) –

    Location of the dataset.

  • num_samples (int) –

    The number of samples to load. Defaults to None.

Returns:
  • Dataset –

    Our dataset represented by a Ray Dataset.

Source code in madewithml/data.py
def load_data(dataset_loc: str, num_samples: int = None) -> Dataset:
    """Load data from source into a Ray Dataset.

    Args:
        dataset_loc (str): Location of the dataset.
        num_samples (int, optional): The number of samples to load. Defaults to None.

    Returns:
        Dataset: Our dataset represented by a Ray Dataset.
    """
    ds = ray.data.read_csv(dataset_loc)
    ds = ds.random_shuffle(seed=1234)
    ds = ray.data.from_items(ds.take(num_samples)) if num_samples else ds
    return ds
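
A usage sketch with a hypothetical dataset location (any CSV that ray.data.read_csv can reach works, whether a local path or a cloud URI):

ds = load_data(dataset_loc="s3://my-bucket/dataset.csv", num_samples=1000)  # hypothetical URI
ds.count()  # 1000 (or the full dataset size when num_samples is None)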

preprocess(df, class_to_index)

Preprocess the data in our dataframe.

Parameters:
  • df (pd.DataFrame) –

    Raw dataframe to preprocess.

  • class_to_index (Dict) –

    Mapping of class names to indices.

Returns:
  • Dict –

    Preprocessed data (ids, masks, targets).

Source code in madewithml/data.py
def preprocess(df: pd.DataFrame, class_to_index: Dict) -> Dict:
    """Preprocess the data in our dataframe.

    Args:
        df (pd.DataFrame): Raw dataframe to preprocess.
        class_to_index (Dict): Mapping of class names to indices.

    Returns:
        Dict: Preprocessed data (ids, masks, targets).
    """
    df["text"] = df.title + " " + df.description  # feature engineering
    df["text"] = df.text.apply(clean_text)  # clean text
    df = df.drop(columns=["id", "created_on", "title", "description"], errors="ignore")  # clean dataframe
    df = df[["text", "tag"]]  # rearrange columns
    df["tag"] = df["tag"].map(class_to_index)  # label encoding
    outputs = tokenize(df)
    return outputs
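
A small sketch with made-up rows (the column values and tag name below are illustrative only):

df = pd.DataFrame({
    "id": [1],
    "created_on": ["2020-02-17"],
    "title": ["Transformers from scratch"],
    "description": ["Implementing self-attention in PyTorch."],
    "tag": ["natural-language-processing"],
})
outputs = preprocess(df, class_to_index={"natural-language-processing": 0})
sorted(outputs)  # ['ids', 'masks', 'targets']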

stratify_split(ds, stratify, test_size, shuffle=True, seed=1234)

Split a dataset into train and test splits, preserving the proportion of data points from each class in the column we want to stratify on.

Parameters:
  • ds (Dataset) –

    Input dataset to split.

  • stratify (str) –

    Name of column to split on.

  • test_size (float) –

    Proportion of dataset to split for test set.

  • shuffle (bool) –

    Whether to shuffle the dataset. Defaults to True.

  • seed (int) –

    Seed for shuffling. Defaults to 1234.

Returns:
  • Tuple[Dataset, Dataset] –

    The stratified train and test datasets.

Source code in madewithml/data.py
def stratify_split(
    ds: Dataset,
    stratify: str,
    test_size: float,
    shuffle: bool = True,
    seed: int = 1234,
) -> Tuple[Dataset, Dataset]:
    """Split a dataset into train and test splits with equal
    amounts of data points from each class in the column we
    want to stratify on.

    Args:
        ds (Dataset): Input dataset to split.
        stratify (str): Name of column to split on.
        test_size (float): Proportion of dataset to split for test set.
        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to True.
        seed (int, optional): Seed for shuffling. Defaults to 1234.

    Returns:
        Tuple[Dataset, Dataset]: the stratified train and test datasets.
    """

    def _add_split(df: pd.DataFrame) -> pd.DataFrame:  # pragma: no cover, used in parent function
        """Naively split a dataframe into train and test splits.
        Add a column specifying whether it's the train or test split."""
        train, test = train_test_split(df, test_size=test_size, shuffle=shuffle, random_state=seed)
        train["_split"] = "train"
        test["_split"] = "test"
        return pd.concat([train, test])

    def _filter_split(df: pd.DataFrame, split: str) -> pd.DataFrame:  # pragma: no cover, used in parent function
        """Filter by data points that match the split column's value
        and return the dataframe with the _split column dropped."""
        return df[df["_split"] == split].drop("_split", axis=1)

    # Train, test split with stratify
    grouped = ds.groupby(stratify).map_groups(_add_split, batch_format="pandas")  # group by each unique value in the column we want to stratify on
    train_ds = grouped.map_batches(_filter_split, fn_kwargs={"split": "train"}, batch_format="pandas")  # keep only the train rows
    test_ds = grouped.map_batches(_filter_split, fn_kwargs={"split": "test"}, batch_format="pandas")  # keep only the test rows

    # Shuffle each split (required)
    train_ds = train_ds.random_shuffle(seed=seed)
    test_ds = test_ds.random_shuffle(seed=seed)

    return train_ds, test_ds
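
For example, continuing from load_data above and assuming the dataset has a tag column (counts are approximate since each class is split independently):

train_ds, test_ds = stratify_split(ds, stratify="tag", test_size=0.2)
train_ds.count(), test_ds.count()  # roughly an 80/20 split, per tag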

tokenize(batch)

Tokenize the text input in our batch using a tokenizer.

Parameters:
  • batch (Dict) –

    Batch of data with the text inputs to tokenize.

Returns:
  • Dict –

    Batch of data with the results of tokenization (input_ids and attention_mask) on the text inputs.

Source code in madewithml/data.py
def tokenize(batch: Dict) -> Dict:
    """Tokenize the text input in our batch using a tokenizer.

    Args:
        batch (Dict): Batch of data with the text inputs to tokenize.

    Returns:
        Dict: Batch of data with the results of tokenization (`input_ids` and `attention_mask`) on the text inputs.
    """
    tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
    encoded_inputs = tokenizer(batch["text"].tolist(), return_tensors="np", padding="longest")
    return dict(ids=encoded_inputs["input_ids"], masks=encoded_inputs["attention_mask"], targets=np.array(batch["tag"]))
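
A quick sketch with a single made-up row; in the pipeline the batch arrives as a pandas DataFrame (via _transform_pandas), with the tag column already label-encoded:

batch = pd.DataFrame({"text": ["transfer learning with transformers"], "tag": [0]})
outputs = tokenize(batch)
outputs["ids"].shape  # (1, sequence_length), padded to the longest text in the batch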