.feature

Intermediate classes based on the main feature.

`BokehForAudio (BokehForUrlToVector)`

BokehForUrlToVector with audio (path like "http://" or "file:///") as the main feature.

Assumes on top of its parent class:

in supplied dataframes
(always) audio urls in an audio column

Does not assume:

what the explorer serves to do.

Source code in hover/core/explorer/feature.py

class BokehForAudio(BokehForUrlToVector):
    """
    ???+ note "`BokehForUrlToVector` with `audio` (path like `"http://"` or `"file:///"`) as the main feature."
        Assumes on top of its parent class:

        - in supplied dataframes
          - (always) audio urls in an `audio` column

        Does not assume:

        - what the explorer serves to do.
    """

    PRIMARY_FEATURE = "audio"
    MANDATORY_COLUMNS = [PRIMARY_FEATURE, "label"]
    TOOLTIP_KWARGS = {
        "label": {"label": "Label"},
        "audio": {"audio": ""},
        "coords": True,
        "index": True,
    }

`BokehForImage (BokehForUrlToVector)`

BokehForUrlToVector with image (path like "http://" or "file:///") as the main feature.

Assumes on top of its parent class:

in supplied dataframes
(always) image urls in an image column

Does not assume:

what the explorer serves to do.

Source code in hover/core/explorer/feature.py

class BokehForImage(BokehForUrlToVector):
    """
    ???+ note "`BokehForUrlToVector` with `image` (path like `"http://"` or `"file:///"`) as the main feature."
        Assumes on top of its parent class:

        - in supplied dataframes
          - (always) image urls in an `image` column

        Does not assume:

        - what the explorer serves to do.
    """

    PRIMARY_FEATURE = "image"
    MANDATORY_COLUMNS = [PRIMARY_FEATURE, "label"]
    TOOLTIP_KWARGS = {
        "label": {"label": "Label"},
        "image": {"image": hover.config["visual"]["tooltip_img_style"]},
        "coords": True,
        "index": True,
    }

`BokehForText (BokehBaseExplorer)`

BokehBaseExplorer with text (str) as the main feature.

Assumes on top of its parent class:

in supplied dataframes
(always) text data in a text column

Does not assume:

what the explorer serves to do.

Source code in hover/core/explorer/feature.py

class BokehForText(BokehBaseExplorer):
    """
    ???+ note "`BokehBaseExplorer` with `text` (`str`) as the main feature."
        Assumes on top of its parent class:

        - in supplied dataframes
          - (always) text data in a `text` column

        Does not assume:

        - what the explorer serves to do.
    """

    PRIMARY_FEATURE = "text"
    MANDATORY_COLUMNS = [PRIMARY_FEATURE, "label"]
    TOOLTIP_KWARGS = {
        "label": {"label": "Label"},
        "text": {"text": "Text"},
        "coords": True,
        "index": True,
    }

    def _setup_search_widgets(self):
        """
        ???+ note "Create positive/negative text search boxes."
        """
        common_kwargs = dict(width_policy="fit", height_policy="fit")
        pos_title, neg_title = "Text contains (python regex):", "Text does not contain:"
        self.search_pos = TextInput(title=pos_title, **common_kwargs)
        self.search_neg = TextInput(title=neg_title, **common_kwargs)

    def _search_watch_widgets(self):
        return [self.search_pos, self.search_neg]

    def _validate_search_input(self):
        """
        ???+ note "Text uses regex search, for which any string can be considered valid."
        """
        return True

    def _get_search_score_function(self):
        """
        ???+ note "Dynamically create a single-argument scoring function."
        """
        pos_regex, neg_regex = self.search_pos.value, self.search_neg.value

        def regex_score(text):
            score = 0
            if len(pos_regex) > 0:
                score += 1 if re.search(pos_regex, text) else -2
            if len(neg_regex) > 0:
                score += -2 if re.search(neg_regex, text) else 1
            return score

        return regex_score

`BokehForUrlToVector (BokehBaseExplorer)`

A layer of abstraction for BokehBaseExplorer subclasses whose feature-type-specific mechanisms work the same way through vectors.

Source code in hover/core/explorer/feature.py

class BokehForUrlToVector(BokehBaseExplorer):
    """
    ???+ note "A layer of abstraction for `BokehBaseExplorer` subclasses whose feature-type-specific mechanisms work the same way through vectors."
    """

    def _setup_search_widgets(self):
        """
        ???+ note "Create similarity search widgets."
        """
        self.search_sim = TextInput(
            title=f"{self.__class__.PRIMARY_FEATURE} similarity search (enter URL)".capitalize(),
            width_policy="fit",
            height_policy="fit",
        )
        self.search_threshold = Slider(
            start=0.0,
            end=1.0,
            value=0.9,
            # fewer steps allowed because refreshing search result can be expensive
            step=0.1,
            title="Similarity threshold",
        )

    def _search_watch_widgets(self):
        return [self.search_sim, self.search_threshold]

    def _subroutine_search_create_callbacks(self):
        """
        ???+ note "Create search callback functions based on feature attributes."
        """
        # determine cache size for normalized vectorizer
        num_points = sum([_df.shape[0] for _df in self.dfs.values()])
        cache_size = min(num_points, int(1e5))

        # find vectorizer
        assert hasattr(self, "linked_dataset"), "need linked_dataset for its vectorizer"
        found_vectorizers = self.linked_dataset.vectorizer_lookup.values()
        assert len(found_vectorizers) > 0, "dataset has no known vectorizer"

        raw_vectorizer = list(found_vectorizers)[0]

        # gain speed up by caching and normalization
        @lru_cache(maxsize=cache_size)
        def normalized_vectorizer(feature):
            try:
                vec = raw_vectorizer(feature)
            except Exception as e:
                self._warn(f"vectorizer crashed: {e}; assigning None as vector.")
                return None
            norm = np.linalg.norm(vec)
            return vec / (norm + 1e-16)

        self._dynamic_resources["normalized_vectorizer"] = normalized_vectorizer

        super()._subroutine_search_create_callbacks()

    def _get_search_score_function(self):
        """
        ???+ note "Dynamically create a single-argument scoring function."
        """
        vectorizer = self._dynamic_resources["normalized_vectorizer"]
        url_query = self.search_sim.value
        sim_thresh = self.search_threshold.value
        vec_query = vectorizer(url_query)

        def cosine_based_score(url_doc):
            # edge case: query or doc is invalid for vectorization
            vec_doc = vectorizer(url_doc)
            if vec_query is None or vec_doc is None:
                return 0

            # common case: query and doc are both valid
            query_doc_sim = (np.dot(vec_query, vec_doc) + 1.0) / 2.0
            if query_doc_sim >= sim_thresh:
                return 1
            else:
                return -1

        return cosine_based_score

    def _validate_search_input(self):
        """
        ???+ note "Must be some url pointing to a suffixed file."

            For speed, avoid sending web requests in this validation step.
        """
        from urllib.parse import urlparse
        from pathlib import Path

        url_query = self.search_sim.value
        file_path = Path(urlparse(url_query).path)
        return bool(file_path.suffix)