# hover.core.dataset
Dataset classes which extend beyond DataFrames.
When we supervise a collection of data, these operations need to be simple:
- managing `raw`/`train`/`dev`/`test` subsets
- transferring data points between subsets
- pulling updates from annotation interfaces
- pushing updates to annotation interfaces
- getting a 2D embedding
- loading data for training models
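For example, a text dataset can be created from plain lists of dicts (a minimal sketch; the records, keys, and label values below are illustrative, and `SupervisableTextDataset` is documented at the bottom of this page):

```python
from hover.core.dataset import SupervisableTextDataset

# illustrative records: each dict carries the feature key,
# and supervised subsets also carry a label key
raw_data = [
    {"text": "it was the best of times"},
    {"text": "it was the worst of times"},
]
train_data = [
    {"text": "a tale of two cities", "label": "positive"},
]

dataset = SupervisableTextDataset(
    raw_dictl=raw_data,      # to-be-supervised data, no labels needed
    train_dictl=train_data,  # already-supervised data
    feature_key="text",      # key holding the feature in each dict
    label_key="label",       # key holding the str label in supervised dicts
)
```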
## SupervisableAudioDataset (SupervisableDataset)

`SupervisableDataset` whose primary feature is `audio`.

Source code in `hover/core/dataset.py`:
```python
class SupervisableAudioDataset(SupervisableDataset):
    """
    ???+ note "`SupervisableDataset` whose primary feature is `audio`."
    """

    FEATURE_KEY = "audio"
```

## SupervisableDataset (Loggable)

Feature-agnostic class for a dataset open to supervision.
The class keeps both a DataFrame form and a list-of-dicts ("dictl") form, with the intention that
- the DataFrame form supports most kinds of operations;
- the list-of-dicts form could be useful for manipulations outside the scope of pandas;
- synchronization between the two forms should be called sparingly.
Source code in
hover/core/dataset.pyclass SupervisableDataset(Loggable): """ ???+ note "Feature-agnostic class for a dataset open to supervision." Keeping a DataFrame form and a list-of-dicts ("dictl") form, with the intention that - the DataFrame form supports most kinds of operations; - the list-of-dicts form could be useful for manipulations outside the scope of pandas; - synchronization between the two forms should be called sparingly. """ # 'scratch': intended to be directly editable by other objects, i.e. Explorers # labels will be stored but not used for information in hover itself SCRATCH_SUBSETS = tuple(["raw"]) # non-'scratch': intended to be read-only outside of the class # 'public': labels will be considered as part of the classification task and will be used for built-in supervision PUBLIC_SUBSETS = tuple(["train", "dev"]) # 'private': labels will be considered as part of the classification task and will NOT be used for supervision PRIVATE_SUBSETS = tuple(["test"]) FEATURE_KEY = "feature" def __init__(self, *args, **kwargs): """ ???+ note "Set up data subsets, widgets, and supplementary data structures." See `self.setup_dfs` for parameter details. """ self._info("Initializing...") self.setup_dfs(*args, **kwargs) self.df_deduplicate() self.compute_feature_index() self.setup_widgets() # self.setup_label_coding() # redundant if setup_pop_table() immediately calls this again self.setup_file_export() self.setup_pop_table() self.setup_sel_table() self._vectorizer_lookup = OrderedDict() self._good(f"{self.__class__.__name__}: finished initialization.") def setup_dfs( self, raw_dictl, train_dictl=None, dev_dictl=None, test_dictl=None, feature_key="feature", label_key="label", ): """ ???+ note "Subroutine of the constructor that creates standard-format DataFrames." | Param | Type | Description | | :------------ | :----- | :----------------------------------- | | `raw_dictl` | `list` | list of dicts holding the **to-be-supervised** raw data | | `train_dictl` | `list` | list of dicts holding any **supervised** train data | | `dev_dictl` | `list` | list of dicts holding any **supervised** dev data | | `test_dictl` | `list` | list of dicts holding any **supervised** test data | | `feature_key` | `str` | the key for the feature in each piece of data | | `label_key` | `str` | the key for the `**str**` label in supervised data | """ def dictl_transform(dictl, labels=True): """ Burner function to transform the input list of dictionaries into standard format. """ # edge case when dictl is empty or None if not dictl: return [] # transform the feature and possibly the label key_transform = {feature_key: self.__class__.FEATURE_KEY} if labels: key_transform[label_key] = "label" def burner(d): """ Burner function to transform a single dict. 
""" if labels: assert label_key in d, f"Expected dict key {label_key}" trans_d = {key_transform.get(_k, _k): _v for _k, _v in d.items()} if not labels: trans_d["label"] = module_config.ABSTAIN_DECODED return trans_d return [burner(_d) for _d in dictl] # standardize records dictls = { "raw": dictl_transform(raw_dictl, labels=False), "train": dictl_transform(train_dictl), "dev": dictl_transform(dev_dictl), "test": dictl_transform(test_dictl), } # initialize dataframes self.dfs = dict() for _key, _dictl in dictls.items(): if _dictl: _df = pd.DataFrame(_dictl) assert self.__class__.FEATURE_KEY in _df.columns assert "label" in _df.columns else: _df = pd.DataFrame(columns=[self.__class__.FEATURE_KEY, "label"]) self.dfs[_key] = _df def copy(self): """ ???+ note "Create another instance, copying over the data entries." Also copy data structures that don't get created in the new instance. """ dataset = self.__class__.from_pandas(self.to_pandas()) dataset._vectorizer_lookup.update(self._vectorizer_lookup) return dataset def compute_feature_index(self): """ ???+ note "Allow lookup by feature value without setting it as the index." Assumes that feature values are unique. The reason not to just set the feature as the index is because integer indices work smoothly with Bokeh `DataSource`s, NumPy `array`s, and Torch `Tensor`s. """ feature_to_subset_idx = {} for _subset, _df in self.dfs.items(): _values = _df[self.__class__.FEATURE_KEY].values for i, _val in enumerate(_values): if _val in feature_to_subset_idx: raise ValueError( f"Expected unique feature values, found duplicate {_val}" ) feature_to_subset_idx[_val] = (_subset, i) self.feature_to_subset_idx = feature_to_subset_idx def locate_by_feature_value(self, value, auto_recompute=True): """ ???+ note "Find the subset and index given a feature value." Assumes that the value is present and detects if the subset and index found is consistent with the value. """ subset, index = self.feature_to_subset_idx[value] current_value = self.dfs[subset].at[index, self.__class__.FEATURE_KEY] if current_value != value: if auto_recompute: self._warn("locate_by_feature_value mismatch. Recomputing index.") self.compute_feature_index() # if ever need to recompute twice, there must be a bug return self.locate_by_feature_value(value, auto_recompute=False) else: raise ValueError("locate_by_feature_value mismatch.") return subset, index def to_pandas(self): """ ???+ note "Export to a pandas DataFrame." """ dfs = [] for _subset in ["raw", "train", "dev", "test"]: _df = self.dfs[_subset].copy() _df[DATASET_SUBSET_FIELD] = _subset dfs.append(_df) return pd.concat(dfs, axis=0) @classmethod def from_pandas(cls, df, **kwargs): """ ???+ note "Import from a pandas DataFrame." | Param | Type | Description | | :------- | :----- | :----------------------------------- | | `df` | `DataFrame` | with a "SUBSET" field dividing subsets | """ SUBSETS = cls.SCRATCH_SUBSETS + cls.PUBLIC_SUBSETS + cls.PRIVATE_SUBSETS if DATASET_SUBSET_FIELD not in df.columns: raise ValueError( f"Expecting column '{DATASET_SUBSET_FIELD}' in the DataFrame which takes values from {SUBSETS}" ) dictls = {} for _subset in ["raw", "train", "dev", "test"]: _sub_df = df[df[DATASET_SUBSET_FIELD] == _subset] dictls[_subset] = _sub_df.to_dict(orient="records") return cls( raw_dictl=dictls["raw"], train_dictl=dictls["train"], dev_dictl=dictls["dev"], test_dictl=dictls["test"], **kwargs, ) def setup_widgets(self): """ ???+ note "Create `bokeh` widgets for interactive data management." 
Operations: - PUSH: push updated dataframes to linked `explorer`s. - COMMIT: added selected points to a specific subset `dataframe`. - DEDUP: cross-deduplicate across all subset `dataframe`s. - VIEW: view selected points of linked `explorer`s. - the link can be different from that for PUSH. Typically all the `explorer`s sync their selections, and only an `annotator` is linked to the `dataset`. - PATCH: update a few edited rows from VIEW result to the dataset. - EVICT: remove a few rows from both VIEW result and linked `explorer` selection. """ self.update_pusher = Button( label="Push", button_type="success", ) self.data_committer = Dropdown( label="Commit", button_type="warning", menu=[*self.__class__.PUBLIC_SUBSETS, *self.__class__.PRIVATE_SUBSETS], ) self.dedup_trigger = Button( label="Dedup", button_type="warning", ) self.selection_viewer = Button( label="View Selected", button_type="primary", ) self.selection_patcher = Button( label="Update Row Values", button_type="warning", ) self.selection_evictor = Button( label="Evict Rows from Selection", button_type="primary", ) self.selection_table_refresh_box = CheckboxGroup( labels=["auto refresh selection table"], active=[] ) self.help_div = dataset_help_widget() def commit_base_callback(): """ COMMIT creates cross-duplicates between subsets. Changes dataset rows. No change to explorers. - PUSH shall be blocked until DEDUP is executed. - PATCH shall be blocked until PUSH is executed. - EVICT shall be blocked until PUSH is executed. """ self.dedup_trigger.disabled = False self.update_pusher.disabled = True self.selection_patcher.disabled = True self.selection_evictor.disabled = True def dedup_base_callback(): """ DEDUP re-creates dfs with different indices than before. Changes dataset rows. No change to explorers. - COMMIT shall be blocked until PUSH is executed. - PATCH shall be blocked until PUSH is executed. - EVICT shall be blocked until PUSH is executed. """ self.update_pusher.disabled = False self.data_committer.disabled = True self.selection_patcher.disabled = True self.selection_evictor.disabled = True self.df_deduplicate() def push_base_callback(): """ PUSH enforces df consistency with all linked explorers. No change to dataset rows. Changes explorers. - DEDUP could be blocked because it stays trivial until COMMIT is executed. """ self.data_committer.disabled = False self.dedup_trigger.disabled = True # empty the selection table, then allow PATCH and EVICT self.sel_table.source.data = dict() self.sel_table.source.selected.indices = [] self.selection_patcher.disabled = False self.selection_evictor.disabled = False self.update_pusher.on_click(push_base_callback) self.data_committer.on_click(commit_base_callback) self.dedup_trigger.on_click(dedup_base_callback) def view(self): """ ???+ note "Defines the layout of `bokeh` objects when visualized." """ # local import to avoid naming confusion/conflicts from bokeh.layouts import row, column return column( self.help_div, # population table and directly associated widgets row( self.update_pusher, self.data_committer, self.dedup_trigger, self.file_exporter, ), self.pop_table, # selection table and directly associated widgets row( self.selection_viewer, self.selection_patcher, self.selection_evictor, ), row( self.selection_table_refresh_box, ), self.sel_table, ) def subscribe_update_push(self, explorer, subset_mapping): """ ???+ note "Enable pushing updated DataFrames to explorers that depend on them." 
| Param | Type | Description | | :--------------- | :----- | :------------------------------------- | | `explorer` | `BokehBaseExplorer` | the explorer to register | | `subset_mapping` | `dict` | `dataset` -> `explorer` subset mapping | Note: the reason we need this is due to `self.dfs[key] = ...`-like assignments. If DF operations were all in-place, then the explorers could directly access the updates through their `self.dfs` references. """ explorer.link_dataset(self) def callback_push(): df_dict = {_v: self.dfs[_k] for _k, _v in subset_mapping.items()} explorer._setup_dfs(df_dict) explorer._update_sources() self.update_pusher.on_click(callback_push) self._good( f"Subscribed {explorer.__class__.__name__} to dataset pushes: {subset_mapping}" ) def subscribe_data_commit(self, explorer, subset_mapping): """ ???+ note "Enable committing data across subsets, specified by a selection in an explorer and a dropdown widget of the dataset." | Param | Type | Description | | :--------------- | :----- | :------------------------------------- | | `explorer` | `BokehBaseExplorer` | the explorer to register | | `subset_mapping` | `dict` | `dataset` -> `explorer` subset mapping | """ explorer.link_dataset(self) def callback_commit(event): for sub_k, sub_v in subset_mapping.items(): sub_to = event.item selected_idx = explorer.sources[sub_v].selected.indices if not selected_idx: self._warn( f"Attempting data commit: did not select any data points in subset {sub_v}." ) return sel_slice = self.dfs[sub_k].iloc[selected_idx] valid_slice = sel_slice[ sel_slice["label"] != module_config.ABSTAIN_DECODED ] # concat to the end and do some accounting size_before = self.dfs[sub_to].shape[0] self.dfs[sub_to] = pd.concat( [self.dfs[sub_to], valid_slice], axis=0, sort=False, ignore_index=True, ) size_mid = self.dfs[sub_to].shape[0] self.dfs[sub_to].drop_duplicates( subset=[self.__class__.FEATURE_KEY], keep="last", inplace=True ) size_after = self.dfs[sub_to].shape[0] self._info( f"Committed {valid_slice.shape[0]} (valid out of {sel_slice.shape[0]} selected) entries from {sub_k} to {sub_to} ({size_before} -> {size_after} with {size_mid-size_after} overwrites)." ) # chain another callback self._callback_update_population() self.data_committer.on_click(callback_commit) self._good( f"Subscribed {explorer.__class__.__name__} to dataset commits: {subset_mapping}" ) def subscribe_selection_view(self, explorer, subsets): """ ???+ note "Enable viewing groups of data entries, specified by a selection in an explorer." 
| Param | Type | Description | | :--------------- | :----- | :------------------------------------- | | `explorer` | `BokehBaseExplorer` | the explorer to register | | `subsets` | `list` | subset selections to consider | """ assert ( isinstance(subsets, list) and len(subsets) > 0 ), "Expected a non-empty list of subsets" explorer.link_dataset(self) def callback_view(): sel_slices = [] for subset in subsets: selected_idx = sorted(explorer.sources[subset].selected.indices) sub_slice = explorer.dfs[subset].iloc[selected_idx] sel_slices.append(sub_slice) selected = pd.concat(sel_slices, axis=0) self._callback_update_selection(selected) def callback_view_refresh(): if 0 in self.selection_table_refresh_box.active: callback_view() def callback_evict(): # create sets for fast index discarding subset_to_indicies = {} for subset in subsets: indicies = set(explorer.sources[subset].selected.indices) subset_to_indicies[subset] = indicies # from datatable index, get feature values to look up dataframe index sel_source = self.sel_table.source raw_indicies = sel_source.selected.indices for i in raw_indicies: feature_value = sel_source.data[self.__class__.FEATURE_KEY][i] subset, idx = self.locate_by_feature_value(feature_value) subset_to_indicies[subset].discard(idx) # assign indices back to change actual selection for subset in subsets: indicies = sorted(list(subset_to_indicies[subset])) explorer.sources[subset].selected.indices = indicies self._good( f"Selection table: evicted {len(raw_indicies)} points from selection." ) # refresh the selection table callback_view() explorer._register_selection_callback("read", callback_view_refresh) self.selection_viewer.on_click(callback_view) self.selection_evictor.on_click(callback_evict) self._good( f"Subscribed {explorer.__class__.__name__} to selection table: {subsets}" ) def setup_label_coding(self, verbose=True, debug=False): """ ???+ note "Auto-determine labels in the dataset, then create encoder/decoder in lexical order." Add `"ABSTAIN"` as a no-label placeholder which gets ignored categorically. | Param | Type | Description | | :-------- | :----- | :--------------------------------- | | `verbose` | `bool` | whether to log verbosely | | `debug` | `bool` | whether to enable label validation | """ all_labels = set() for _key in [*self.__class__.PUBLIC_SUBSETS, *self.__class__.PRIVATE_SUBSETS]: _df = self.dfs[_key] _found_labels = set(_df["label"].tolist()) all_labels = all_labels.union(_found_labels) # exclude ABSTAIN from self.classes, but include it in the encoding all_labels.discard(module_config.ABSTAIN_DECODED) self.classes = sorted(all_labels) self.label_encoder = { **{_label: _i for _i, _label in enumerate(self.classes)}, module_config.ABSTAIN_DECODED: module_config.ABSTAIN_ENCODED, } self.label_decoder = {_v: _k for _k, _v in self.label_encoder.items()} if verbose: self._good( f"Set up label encoder/decoder with {len(self.classes)} classes." ) if debug: self.validate_labels() def validate_labels(self, raise_exception=True): """ ???+ note "Assert that every label is in the encoder." 
| Param | Type | Description | | :---------------- | :----- | :---------------------------------- | | `raise_exception` | `bool` | whether to raise errors when failed | """ for _key in [*self.__class__.PUBLIC_SUBSETS, *self.__class__.PRIVATE_SUBSETS]: _invalid_indices = None assert "label" in self.dfs[_key].columns _mask = self.dfs[_key]["label"].apply( lambda x: int(x in self.label_encoder) ) # DO NOT change the "==" to "is"; False in pandas is not False below _invalid_indices = np.where(_mask == 0)[0].tolist() if _invalid_indices: self._fail(f"Subset {_key} has invalid labels:") self._print(self.dfs[_key].loc[_invalid_indices]) if raise_exception: raise ValueError("invalid labels") def setup_file_export(self): self.file_exporter = Dropdown( label="Export", button_type="warning", menu=["Excel", "CSV", "JSON", "pickle"], height_policy="fit", width_policy="min", ) def callback_export(event, path_root=None): """ A callback on clicking the 'self.annotator_export' button. Saves the dataframe to a pickle. """ export_format = event.item # auto-determine the export path root if path_root is None: timestamp = current_time("%Y%m%d%H%M%S") export_dir = module_config.DATA_SAVE_DIR path_root = os.path.join(export_dir, f"hover-dataset-{timestamp}") export_df = self.to_pandas() if export_format == "Excel": export_path = f"{path_root}.xlsx" export_df.to_excel(export_path, index=False) elif export_format == "CSV": export_path = f"{path_root}.csv" export_df.to_csv(export_path, index=False) elif export_format == "JSON": export_path = f"{path_root}.json" export_df.to_json(export_path, orient="records") elif export_format == "pickle": export_path = f"{path_root}.pkl" export_df.to_pickle(export_path) else: raise ValueError(f"Unexpected export format {export_format}") self._good(f"saved Pandas DataFrame version to {export_path}") # assign the callback, keeping its reference self._callback_export = callback_export self.file_exporter.on_click(self._callback_export) def setup_pop_table(self, **kwargs): """ ???+ note "Set up a bokeh `DataTable` widget for monitoring subset data populations." | Param | Type | Description | | :--------- | :----- | :--------------------------- | | `**kwargs` | | forwarded to the `DataTable` | """ subsets = [ *self.__class__.SCRATCH_SUBSETS, *self.__class__.PUBLIC_SUBSETS, *self.__class__.PRIVATE_SUBSETS, ] pop_source = ColumnDataSource(dict()) pop_columns = [ TableColumn(field="label", title="label"), *[ TableColumn(field=f"count_{_subset}", title=_subset) for _subset in subsets ], TableColumn( field="color", title="color", formatter=HTMLTemplateFormatter(template=COLOR_GLYPH_TEMPLATE), ), ] self.pop_table = DataTable(source=pop_source, columns=pop_columns, **kwargs) def update_population(): """ Callback function. """ # make sure that the label coding is correct self.setup_label_coding() # re-compute label population eff_labels = [module_config.ABSTAIN_DECODED, *self.classes] color_dict = auto_label_color(self.classes) eff_colors = [color_dict[_label] for _label in eff_labels] pop_data = dict(color=eff_colors, label=eff_labels) for _subset in subsets: _subpop = self.dfs[_subset]["label"].value_counts() pop_data[f"count_{_subset}"] = [ _subpop.get(_label, 0) for _label in eff_labels ] # push results to bokeh data source pop_source.data = pop_data self._good( f"Population updater: latest population with {len(self.classes)} classes." 
) update_population() self.dedup_trigger.on_click(update_population) # store the callback so that it can be referenced by other methods self._callback_update_population = update_population def setup_sel_table(self, **kwargs): """ ???+ note "Set up a bokeh `DataTable` widget for viewing selected data points." | Param | Type | Description | | :--------- | :----- | :--------------------------- | | `**kwargs` | | forwarded to the `DataTable` | """ sel_source = ColumnDataSource(dict()) sel_columns = dataset_default_sel_table_columns(self.__class__.FEATURE_KEY) table_kwargs = dataset_default_sel_table_kwargs(self.__class__.FEATURE_KEY) table_kwargs.update(kwargs) self.sel_table = DataTable( source=sel_source, columns=sel_columns, **table_kwargs ) def update_selection(selected_df): """ To be triggered as a subroutine of `self.selection_viewer`. """ sel_source.data = selected_df.to_dict(orient="list") # now that selection table has changed, clear sub-selection sel_source.selected.indices = [] self._good( f"Selection table: latest selection with {selected_df.shape[0]} entries." ) self._callback_update_selection = update_selection def patch_edited_selection(): sel_source = self.sel_table.source raw_indices = sel_source.selected.indices for i in raw_indices: feature_value = sel_source.data[self.__class__.FEATURE_KEY][i] subset, idx = self.locate_by_feature_value(feature_value) for key in sel_source.data.keys(): self.dfs[subset].at[idx, key] = sel_source.data[key][i] self._good(f"Selection table: edited {len(raw_indices)} dataset rows.") # if edited labels (which is common), then population has changed self._callback_update_population() self.selection_patcher.on_click(patch_edited_selection) def df_deduplicate(self): """ ???+ note "Cross-deduplicate data entries by feature between subsets." """ self._info("Deduplicating...") # for data entry accounting before, after = dict(), dict() # deduplicating rule: entries that come LATER are of higher priority ordered_subsets = [ *self.__class__.SCRATCH_SUBSETS, *self.__class__.PUBLIC_SUBSETS, *self.__class__.PRIVATE_SUBSETS, ] # keep track of which df has which columns and which rows came from which subset columns = dict() for _key in ordered_subsets: before[_key] = self.dfs[_key].shape[0] columns[_key] = self.dfs[_key].columns self.dfs[_key]["__subset"] = _key # concatenate in order and deduplicate overall_df = pd.concat( [self.dfs[_key] for _key in ordered_subsets], axis=0, sort=False ) overall_df.drop_duplicates( subset=[self.__class__.FEATURE_KEY], keep="last", inplace=True ) overall_df.reset_index(drop=True, inplace=True) # cut up slices for _key in ordered_subsets: self.dfs[_key] = overall_df[overall_df["__subset"] == _key].reset_index( drop=True, inplace=False )[columns[_key]] after[_key] = self.dfs[_key].shape[0] self._info(f"--subset {_key} rows: {before[_key]} -> {after[_key]}.") self.compute_feature_index() @property def vectorizer_lookup(self): return self._vectorizer_lookup @vectorizer_lookup.setter def vectorizer_lookup(self, *args, **kwargs): self._fail("assigning vectorizer lookup by reference is forbidden.") def compute_nd_embedding(self, vectorizer, method=None, dimension=2, **kwargs): """ ???+ note "Get embeddings in n-dimensional space and return the dimensionality reducer." 
Reference: [`DimensionalityReducer`](https://github.com/phurwicz/hover/blob/main/hover/core/representation/reduction.py) | Param | Type | Description | | :----------- | :--------- | :--------------------------------- | | `vectorizer` | `callable` | the feature -> vector function | | `method` | `str` | arg for `DimensionalityReducer` | | `dimension` | `int` | dimension of output embedding | | `**kwargs` | | kwargs for `DimensionalityReducer` | """ from hover.core.representation.reduction import DimensionalityReducer if method is None: method = hover.config["data.embedding"]["default_reduction_method"] # register the vectorizer for scenarios that may need it self.vectorizer_lookup[dimension] = vectorizer # prepare input vectors to manifold learning fit_subset = [*self.__class__.SCRATCH_SUBSETS, *self.__class__.PUBLIC_SUBSETS] trans_subset = [*self.__class__.PRIVATE_SUBSETS] assert not set(fit_subset).intersection(set(trans_subset)), "Unexpected overlap" assert isinstance(dimension, int) and dimension >= 2 embedding_cols = [embedding_field(dimension, i) for i in range(dimension)] # compute vectors and keep track which where to slice the array for fitting feature_inp = [] for _key in fit_subset: feature_inp.extend(self.dfs[_key][self.__class__.FEATURE_KEY].tolist()) fit_num = len(feature_inp) for _key in trans_subset: feature_inp.extend(self.dfs[_key][self.__class__.FEATURE_KEY].tolist()) trans_arr = np.array( [vectorizer(_inp) for _inp in tqdm(feature_inp, desc="Vectorizing")] ) # initialize and fit manifold learning reducer using specified subarray self._info(f"Fit-transforming {method.upper()} on {fit_num} samples...") reducer = DimensionalityReducer(trans_arr[:fit_num]) fit_embedding = reducer.fit_transform(method, dimension=dimension, **kwargs) # compute embedding of the whole dataset self._info( f"Transforming {method.upper()} on {trans_arr.shape[0]-fit_num} samples..." ) trans_embedding = reducer.transform(trans_arr[fit_num:], method) # assign x and y coordinates to dataset start_idx = 0 for _subset, _embedding in [ (fit_subset, fit_embedding), (trans_subset, trans_embedding), ]: # edge case: embedding is too small if _embedding.shape[0] < 1: for _key in _subset: assert ( self.dfs[_key].shape[0] == 0 ), "Expected empty df due to empty embedding" continue for _key in _subset: _length = self.dfs[_key].shape[0] for _i in range(dimension): _col = embedding_cols[_i] self.dfs[_key][_col] = pd.Series( _embedding[start_idx : (start_idx + _length), _i] ) start_idx += _length self._good(f"Computed {dimension}-d embedding in columns {embedding_cols}") return reducer def compute_2d_embedding(self, vectorizer, method=None, **kwargs): """ ???+ note "Get embeddings in the xy-plane and return the dimensionality reducer." A special case of `compute_nd_embedding`. | Param | Type | Description | | :----------- | :--------- | :--------------------------------- | | `vectorizer` | `callable` | the feature -> vector function | | `method` | `str` | arg for `DimensionalityReducer` | | `**kwargs` | | kwargs for `DimensionalityReducer` | """ reducer = self.compute_nd_embedding( vectorizer, method=None, dimension=2, **kwargs ) return reducer def loader(self, key, *vectorizers, batch_size=64, smoothing_coeff=0.0): """ ???+ note "Prepare a torch `Dataloader` for training or evaluation." | Param | Type | Description | | :------------ | :------------ | :--------------------------------- | | `key` | `str` | subset of data, e.g. 
`"train"` | | `vectorizers` | `callable`(s) | the feature -> vector function(s) | | `batch_size` | `int` | size per batch | | `smoothing_coeff` | `float` | portion of probability to equally split between classes | """ # lazy import: missing torch should not break the rest of the class from hover.utils.torch_helper import ( VectorDataset, MultiVectorDataset, one_hot, label_smoothing, ) # take the slice that has a meaningful label df = self.dfs[key][self.dfs[key]["label"] != module_config.ABSTAIN_DECODED] # edge case: valid slice is too small if df.shape[0] < 1: raise ValueError(f"Subset {key} has too few samples ({df.shape[0]})") batch_size = min(batch_size, df.shape[0]) # prepare output vectors labels = df["label"].apply(lambda x: self.label_encoder[x]).tolist() output_vectors = one_hot(labels, num_classes=len(self.classes)) if smoothing_coeff > 0.0: output_vectors = label_smoothing( output_vectors, coefficient=smoothing_coeff ) # prepare input vectors assert len(vectorizers) > 0, "Expected at least one vectorizer" multi_flag = len(vectorizers) > 1 features = df[self.__class__.FEATURE_KEY].tolist() input_vector_lists = [] for _vec_func in vectorizers: self._info(f"Preparing {key} input vectors...") _input_vecs = [_vec_func(_f) for _f in tqdm(features, desc="Vectorizing")] input_vector_lists.append(_input_vecs) self._info(f"Preparing {key} data loader...") if multi_flag: assert len(input_vector_lists) > 1, "Expected multiple lists of vectors" loader = MultiVectorDataset(input_vector_lists, output_vectors).loader( batch_size=batch_size ) else: assert len(input_vector_lists) == 1, "Expected only one list of vectors" input_vectors = input_vector_lists[0] loader = VectorDataset(input_vectors, output_vectors).loader( batch_size=batch_size ) self._good( f"Prepared {key} loader with {len(features)} examples; {len(vectorizers)} vectors per feature, batch size {batch_size}" ) return loader__init__(self, *args, **kwargs)specialSet up data subsets, widgets, and supplementary data structures.
### __init__(self, *args, **kwargs) *(special)*

Set up data subsets, widgets, and supplementary data structures. See `self.setup_dfs` for parameter details.
### compute_2d_embedding(self, vectorizer, method=None, **kwargs)

Get embeddings in the xy-plane and return the dimensionality reducer.
A special case of `compute_nd_embedding`.

| Param | Type | Description |
| :----------- | :--------- | :--------------------------------- |
| `vectorizer` | `callable` | the feature -> vector function |
| `method` | `str` | arg for `DimensionalityReducer` |
| `**kwargs` | | kwargs for `DimensionalityReducer` |
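Usage sketch (the vectorizer below is a toy stand-in for a real feature encoder, e.g. a spaCy text vectorizer, and assumes the dataset holds enough points for the default reduction method):

```python
import numpy as np

def toy_vectorizer(text):
    # stand-in feature -> vector function: any callable returning a
    # fixed-length numpy vector per data point works here
    vec = np.zeros(16)
    for i, char in enumerate(text[:16]):
        vec[i] = ord(char) / 128.0
    return vec

# fits a DimensionalityReducer, writes 2-d embedding columns into
# every subset DataFrame, and returns the fitted reducer
reducer = dataset.compute_2d_embedding(toy_vectorizer)
```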
### compute_feature_index(self)

Allow lookup by feature value without setting it as the index.
Assumes that feature values are unique. The reason not to just set the feature as the index is that integer indices work smoothly with Bokeh `DataSource`s, NumPy `array`s, and Torch `Tensor`s.
### compute_nd_embedding(self, vectorizer, method=None, dimension=2, **kwargs)

Get embeddings in n-dimensional space and return the dimensionality reducer.
Reference: [`DimensionalityReducer`](https://github.com/phurwicz/hover/blob/main/hover/core/representation/reduction.py)

| Param | Type | Description |
| :----------- | :--------- | :--------------------------------- |
| `vectorizer` | `callable` | the feature -> vector function |
| `method` | `str` | arg for `DimensionalityReducer` |
| `dimension` | `int` | dimension of output embedding |
| `**kwargs` | | kwargs for `DimensionalityReducer` |
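The 2-d method above is a thin wrapper around this one; calling it directly lets you choose the dimension and reduction method explicitly (assuming the corresponding reducer library, e.g. `umap-learn` for `"umap"`, is installed):

```python
# same toy vectorizer as above; "umap" is an example method string
reducer = dataset.compute_nd_embedding(toy_vectorizer, method="umap", dimension=3)
```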
### copy(self)

Create another instance, copying over the data entries.
Also copy data structures that don't get created in the new instance.
### df_deduplicate(self)

Cross-deduplicate data entries by feature between subsets.
### from_pandas(df, **kwargs) *(classmethod)*

Import from a pandas DataFrame.
| Param | Type | Description |
| :------- | :---------- | :------------------------------------- |
| `df` | `DataFrame` | with a "SUBSET" field dividing subsets |
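A round trip through pandas might look like the following sketch; the subset column is written as `"SUBSET"` here, matching how `DATASET_SUBSET_FIELD` is rendered above, and the labels are illustrative:

```python
import pandas as pd
from hover.core.dataset import SupervisableTextDataset

df = pd.DataFrame(
    {
        "text": ["foo", "bar", "baz"],
        "label": ["ABSTAIN", "positive", "negative"],  # "ABSTAIN" = no-label placeholder
        "SUBSET": ["raw", "train", "dev"],
    }
)
dataset = SupervisableTextDataset.from_pandas(df)

# to_pandas() re-attaches the subset column, so the export can be re-imported
exported = dataset.to_pandas()
```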
### loader(self, key, *vectorizers, batch_size=64, smoothing_coeff=0.0)

Prepare a torch `Dataloader` for training or evaluation.

| Param | Type | Description |
| :---------------- | :------------ | :------------------------------------------------------ |
| `key` | `str` | subset of data, e.g. `"train"` |
| `vectorizers` | `callable`(s) | the feature -> vector function(s) |
| `batch_size` | `int` | size per batch |
| `smoothing_coeff` | `float` | portion of probability to equally split between classes |
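Usage sketch (requires PyTorch; the vectorizer is any feature -> vector callable, such as the toy one shown earlier on this page):

```python
# build a torch DataLoader over the labeled "train" subset;
# multiple vectorizers may be passed for multi-input models
train_loader = dataset.loader(
    "train",
    toy_vectorizer,
    batch_size=32,
    smoothing_coeff=0.1,  # optional label smoothing on the one-hot targets
)
```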
### locate_by_feature_value(self, value, auto_recompute=True)

Find the subset and index given a feature value.
Assumes that the value is present, and checks that the subset and index found are consistent with the value.
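For example (continuing the dataset sketched earlier on this page):

```python
# which subset DataFrame holds this feature value, and at which row?
subset, row_idx = dataset.locate_by_feature_value("it was the best of times")
value = dataset.dfs[subset].at[row_idx, dataset.FEATURE_KEY]
```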
### setup_dfs(self, raw_dictl, train_dictl=None, dev_dictl=None, test_dictl=None, feature_key='feature', label_key='label')

Subroutine of the constructor that creates standard-format DataFrames.
| Param | Type | Description |
| :------------ | :----- | :------------------------------------------------------ |
| `raw_dictl` | `list` | list of dicts holding the **to-be-supervised** raw data |
| `train_dictl` | `list` | list of dicts holding any **supervised** train data |
| `dev_dictl` | `list` | list of dicts holding any **supervised** dev data |
| `test_dictl` | `list` | list of dicts holding any **supervised** test data |
| `feature_key` | `str` | the key for the feature in each piece of data |
| `label_key` | `str` | the key for the **str** label in supervised data |
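With the constructor call sketched at the top of this page, the standardized DataFrames would look roughly like this (illustrative):

```python
print(dataset.dfs["raw"].columns.tolist())     # e.g. ['text', 'label']
print(dataset.dfs["raw"]["label"].unique())    # raw rows get the ABSTAIN placeholder
print(dataset.dfs["train"]["label"].tolist())  # supervised rows keep their labels
```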
### setup_label_coding(self, verbose=True, debug=False)

Auto-determine labels in the dataset, then create encoder/decoder in lexical order.
Add `"ABSTAIN"` as a no-label placeholder which gets ignored categorically.

| Param | Type | Description |
| :-------- | :----- | :--------------------------------- |
| `verbose` | `bool` | whether to log verbosely |
| `debug` | `bool` | whether to enable label validation |
### setup_pop_table(self, **kwargs)

Set up a bokeh `DataTable` widget for monitoring subset data populations.

| Param | Type | Description |
| :--------- | :--- | :--------------------------- |
| `**kwargs` | | forwarded to the `DataTable` |
### setup_sel_table(self, **kwargs)

Set up a bokeh `DataTable` widget for viewing selected data points.

| Param | Type | Description |
| :--------- | :--- | :--------------------------- |
| `**kwargs` | | forwarded to the `DataTable` |
### setup_widgets(self)

Create `bokeh` widgets for interactive data management.

Operations:

- PUSH: push updated dataframes to linked `explorer`s.
- COMMIT: add selected points to a specific subset `dataframe`.
- DEDUP: cross-deduplicate across all subset `dataframe`s.
- VIEW: view selected points of linked `explorer`s.
    - the link can be different from that for PUSH. Typically all the `explorer`s sync their selections, and only an `annotator` is linked to the `dataset`.
- PATCH: update a few edited rows from the VIEW result to the dataset.
- EVICT: remove a few rows from both the VIEW result and the linked `explorer` selection.
### subscribe_data_commit(self, explorer, subset_mapping)

Enable committing data across subsets, specified by a selection in an explorer and a dropdown widget of the dataset.

| Param | Type | Description |
| :--------------- | :------------------ | :------------------------------------- |
| `explorer` | `BokehBaseExplorer` | the explorer to register |
| `subset_mapping` | `dict` | `dataset` -> `explorer` subset mapping |
### subscribe_selection_view(self, explorer, subsets)

Enable viewing groups of data entries, specified by a selection in an explorer.

| Param | Type | Description |
| :--------- | :------------------ | :----------------------------- |
| `explorer` | `BokehBaseExplorer` | the explorer to register |
| `subsets` | `list` | subset selections to consider |
### subscribe_update_push(self, explorer, subset_mapping)

Enable pushing updated DataFrames to explorers that depend on them.

| Param | Type | Description |
| :--------------- | :------------------ | :------------------------------------- |
| `explorer` | `BokehBaseExplorer` | the explorer to register |
| `subset_mapping` | `dict` | `dataset` -> `explorer` subset mapping |

Note: the reason we need this is due to `self.dfs[key] = ...`-like assignments. If DataFrame operations were all in-place, then the explorers could directly access the updates through their `self.dfs` references.
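A wiring sketch (hypothetical names: `annotator` stands for any linked `BokehBaseExplorer` instance created elsewhere, and the subset mapping depends on which subsets that explorer displays):

```python
# on every click of the dataset's "Push" button, send these dataset
# subsets to the corresponding subsets of the explorer
dataset.subscribe_update_push(
    annotator,
    {"raw": "raw", "train": "train", "dev": "dev"},
)
```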
### to_pandas(self)

Export to a pandas DataFrame.
### validate_labels(self, raise_exception=True)

Assert that every label is in the encoder.

| Param | Type | Description |
| :---------------- | :----- | :---------------------------------- |
| `raise_exception` | `bool` | whether to raise errors when failed |
### view(self)

Defines the layout of `bokeh` objects when visualized.
## SupervisableImageDataset (SupervisableDataset)

`SupervisableDataset` whose primary feature is `image`.

Source code in `hover/core/dataset.py`:
```python
class SupervisableImageDataset(SupervisableDataset):
    """
    ???+ note "`SupervisableDataset` whose primary feature is `image`."
    """

    FEATURE_KEY = "image"
```

## SupervisableTextDataset (SupervisableDataset)

`SupervisableDataset` whose primary feature is `text`.

Source code in `hover/core/dataset.py`:
```python
class SupervisableTextDataset(SupervisableDataset):
    """
    ???+ note "`SupervisableDataset` whose primary feature is `text`."
    """

    FEATURE_KEY = "text"
```