CustomPreprocessor
Bases: Preprocessor
Custom preprocessor class.
Source code in madewithml/data.py
```python
class CustomPreprocessor(Preprocessor):
    """Custom preprocessor class."""

    def _fit(self, ds):
        tags = ds.unique(column="tag")
        self.class_to_index = {tag: i for i, tag in enumerate(tags)}
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}

    def _transform_pandas(self, batch):  # could also do _transform_numpy
        return preprocess(batch, class_to_index=self.class_to_index)
```
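Below is a minimal usage sketch (not part of the module): Ray's `Preprocessor` API exposes public `fit()` and `transform()` methods that dispatch to the private `_fit` and `_transform_pandas` hooks above. The dataset path and example mapping are hypothetical placeholders.

```python
import ray

from madewithml.data import CustomPreprocessor

ds = ray.data.read_csv("datasets/dataset.csv")  # hypothetical path
preprocessor = CustomPreprocessor()
preprocessor = preprocessor.fit(ds)          # runs _fit to build class_to_index
transformed_ds = preprocessor.transform(ds)  # runs _transform_pandas per batch
print(preprocessor.class_to_index)           # e.g. {"mlops": 0, "nlp": 1, ...} (illustrative)
```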
clean_text(text, stopwords=STOPWORDS)
Clean raw text string.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `text` | `str` | Raw text to clean. | *required* |
| `stopwords` | `List` | List of stopwords to remove from the text. | `STOPWORDS` |

Returns:

| Type | Description |
| --- | --- |
| `str` | Cleaned version of the input text. |
Source code in madewithml/data.py

```python
def clean_text(text: str, stopwords: List = STOPWORDS) -> str:
    """Clean raw text string.

    Args:
        text (str): Raw text to clean.
        stopwords (List, optional): List of stopwords to remove from the text. Defaults to STOPWORDS.

    Returns:
        str: Cleaned version of the input text.
    """
    # Lower
    text = text.lower()

    # Remove stopwords
    pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
    text = pattern.sub(" ", text)

    # Spacing and filters
    text = re.sub(r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text)  # add spacing
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()  # strip white space at the ends
    text = re.sub(r"http\S+", "", text)  # remove links

    return text
```
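For a concrete sense of what this does, here is a hedged example. The exact output depends on the contents of `STOPWORDS`; the assumption below is that common English stopwords such as "with", "a", "in", and "the" are included.

```python
from madewithml.data import clean_text

text = "Transfer learning with transformers: a survey in the wild!"
print(clean_text(text))
# -> "transfer learning transformers survey wild"  (assuming the stopwords above)
```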
load_data(dataset_loc, num_samples=None)
Load data from source into a Ray Dataset.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `dataset_loc` | `str` | Location of the dataset. | *required* |
| `num_samples` | `int` | The number of samples to load. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `Dataset` | Our dataset represented by a Ray Dataset. |
Source code in madewithml/data.py

```python
def load_data(dataset_loc: str, num_samples: int = None) -> Dataset:
    """Load data from source into a Ray Dataset.

    Args:
        dataset_loc (str): Location of the dataset.
        num_samples (int, optional): The number of samples to load. Defaults to None.

    Returns:
        Dataset: Our dataset represented by a Ray Dataset.
    """
    ds = ray.data.read_csv(dataset_loc)
    ds = ds.random_shuffle(seed=1234)
    ds = ray.data.from_items(ds.take(num_samples)) if num_samples else ds
    return ds
```
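A short usage sketch; the CSV location is a placeholder, and any local path or URL that `ray.data.read_csv` accepts works:

```python
from madewithml.data import load_data

ds = load_data(dataset_loc="datasets/dataset.csv", num_samples=1000)  # hypothetical path
print(ds.count())  # 1000, assuming the CSV has at least that many rows
print(ds.take(1))  # peek at one (shuffled) record
```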
preprocess(df, class_to_index)
Preprocess the data in our dataframe.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `df` | `pd.DataFrame` | Raw dataframe to preprocess. | *required* |
| `class_to_index` | `Dict` | Mapping of class names to indices. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `Dict` | Preprocessed data (ids, masks, targets). |
Source code in madewithml/data.py

```python
def preprocess(df: pd.DataFrame, class_to_index: Dict) -> Dict:
    """Preprocess the data in our dataframe.

    Args:
        df (pd.DataFrame): Raw dataframe to preprocess.
        class_to_index (Dict): Mapping of class names to indices.

    Returns:
        Dict: Preprocessed data (ids, masks, targets).
    """
    df["text"] = df.title + " " + df.description  # feature engineering
    df["text"] = df.text.apply(clean_text)  # clean text
    df = df.drop(columns=["id", "created_on", "title", "description"], errors="ignore")  # clean dataframe
    df = df[["text", "tag"]]  # rearrange columns
    df["tag"] = df["tag"].map(class_to_index)  # label encoding
    outputs = tokenize(df)
    return outputs
```
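A minimal sketch on a one-row toy dataframe (all values are made up for illustration, and the mapping is hypothetical; running it downloads the scibert tokenizer used by `tokenize`):

```python
import pandas as pd

from madewithml.data import preprocess

df = pd.DataFrame({
    "id": [1],
    "created_on": ["2024-01-01"],
    "title": ["Transformers from scratch"],
    "description": ["Implementing attention in PyTorch."],
    "tag": ["natural-language-processing"],
})
class_to_index = {"natural-language-processing": 0}  # hypothetical mapping
outputs = preprocess(df, class_to_index=class_to_index)
print(outputs.keys())  # dict_keys(['ids', 'masks', 'targets'])
```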
stratify_split(ds, stratify, test_size, shuffle=True, seed=1234)
Split a dataset into train and test splits with equal amounts of data points from each class in the column we want to stratify on.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `ds` | `Dataset` | Input dataset to split. | *required* |
| `stratify` | `str` | Name of column to split on. | *required* |
| `test_size` | `float` | Proportion of dataset to split for test set. | *required* |
| `shuffle` | `bool` | Whether to shuffle the dataset. | `True` |
| `seed` | `int` | Seed for shuffling. | `1234` |

Returns:

| Type | Description |
| --- | --- |
| `Tuple[Dataset, Dataset]` | The stratified train and test datasets. |
Source code in madewithml/data.py

```python
def stratify_split(
    ds: Dataset,
    stratify: str,
    test_size: float,
    shuffle: bool = True,
    seed: int = 1234,
) -> Tuple[Dataset, Dataset]:
    """Split a dataset into train and test splits with equal
    amounts of data points from each class in the column we
    want to stratify on.

    Args:
        ds (Dataset): Input dataset to split.
        stratify (str): Name of column to split on.
        test_size (float): Proportion of dataset to split for test set.
        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to True.
        seed (int, optional): Seed for shuffling. Defaults to 1234.

    Returns:
        Tuple[Dataset, Dataset]: The stratified train and test datasets.
    """

    def _add_split(df: pd.DataFrame) -> pd.DataFrame:  # pragma: no cover, used in parent function
        """Naively split a dataframe into train and test splits.
        Add a column specifying whether it's the train or test split."""
        train, test = train_test_split(df, test_size=test_size, shuffle=shuffle, random_state=seed)
        train["_split"] = "train"
        test["_split"] = "test"
        return pd.concat([train, test])

    def _filter_split(df: pd.DataFrame, split: str) -> pd.DataFrame:  # pragma: no cover, used in parent function
        """Filter by data points that match the split column's value
        and return the dataframe with the _split column dropped."""
        return df[df["_split"] == split].drop("_split", axis=1)

    # Train, test split with stratify
    grouped = ds.groupby(stratify).map_groups(_add_split, batch_format="pandas")  # group by each unique value in the column we want to stratify on
    train_ds = grouped.map_batches(_filter_split, fn_kwargs={"split": "train"}, batch_format="pandas")  # keep only train rows
    test_ds = grouped.map_batches(_filter_split, fn_kwargs={"split": "test"}, batch_format="pandas")  # keep only test rows

    # Shuffle each split (required)
    train_ds = train_ds.random_shuffle(seed=seed)
    test_ds = test_ds.random_shuffle(seed=seed)
    return train_ds, test_ds
```
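Usage sketch for an 80/20 stratified split, with a quick per-class count to verify the proportions (dataset path is a placeholder):

```python
from madewithml.data import load_data, stratify_split

ds = load_data(dataset_loc="datasets/dataset.csv")  # hypothetical path
train_ds, test_ds = stratify_split(ds, stratify="tag", test_size=0.2)

# Per-class counts in each split should be roughly 80/20 of the originals.
print(train_ds.groupby("tag").count().take_all())
print(test_ds.groupby("tag").count().take_all())
```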
tokenize(batch)
Tokenize the text input in our batch using a tokenizer.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `batch` | `Dict` | Batch of data with the text inputs to tokenize. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `Dict` | Batch of data with the results of tokenization (`input_ids` and `attention_mask`) on the text inputs. |
Source code in madewithml/data.py

```python
def tokenize(batch: Dict) -> Dict:
    """Tokenize the text input in our batch using a tokenizer.

    Args:
        batch (Dict): Batch of data with the text inputs to tokenize.

    Returns:
        Dict: Batch of data with the results of tokenization (`input_ids` and `attention_mask`) on the text inputs.
    """
    tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
    encoded_inputs = tokenizer(batch["text"].tolist(), return_tensors="np", padding="longest")
    return dict(ids=encoded_inputs["input_ids"], masks=encoded_inputs["attention_mask"], targets=np.array(batch["tag"]))
```
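A small sketch of `tokenize` on a toy batch. Note that the function expects `batch["text"]` to support `.tolist()`, so a pandas batch works; the label is assumed to be already encoded to an index:

```python
import pandas as pd

from madewithml.data import tokenize

batch = pd.DataFrame({"text": ["transfer learning with transformers"], "tag": [0]})
outputs = tokenize(batch)
print(outputs["ids"].shape)    # (1, seq_len): token ids, padded to the longest text
print(outputs["masks"].shape)  # (1, seq_len): attention mask
print(outputs["targets"])      # [0]
```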