Skip to content

.feature

  • Intermediate classes based on the main feature.

    BokehForAudio (BokehForUrlToVector)

    BokehForUrlToVector with audio (path like "http://" or "file:///") as the main feature.

    Assumes on top of its parent class:

    • in supplied dataframes:
        ◦ (always) audio URLs in an audio column

    Does not assume:

    • what the explorer serves to do.
    Source code in hover/core/explorer/feature.py
    class BokehForAudio(BokehForUrlToVector):
        """
        ???+ note "`BokehForUrlToVector` specialized for `audio` features given as URLs (paths like `"http://"` or `"file:///"`)."
            Adds these assumptions to those of the parent class:

            - supplied dataframes
              - (always) hold audio urls in an `audio` column

            Deliberately leaves open:

            - the purpose that the explorer serves.
        """

        # name of the main feature; also the dataframe column that holds it
        PRIMARY_FEATURE = "audio"

        # columns named as mandatory: the main feature plus the label
        MANDATORY_COLUMNS = [PRIMARY_FEATURE, "label"]

        # tooltip settings; how they are consumed is decided by the parent
        # classes, which are not part of this block
        TOOLTIP_KWARGS = dict(
            label={"label": "Label"},
            audio={"audio": ""},
            coords=True,
            index=True,
        )
    

    BokehForImage (BokehForUrlToVector)

    BokehForUrlToVector with image (path like "http://" or "file:///") as the main feature.

    Assumes on top of its parent class:

    • in supplied dataframes:
        ◦ (always) image URLs in an image column

    Does not assume:

    • what the explorer serves to do.
    Source code in hover/core/explorer/feature.py
    class BokehForImage(BokehForUrlToVector):
        """
        ???+ note "`BokehForUrlToVector` specialized for `image` features given as URLs (paths like `"http://"` or `"file:///"`)."
            Adds these assumptions to those of the parent class:

            - supplied dataframes
              - (always) hold image urls in an `image` column

            Deliberately leaves open:

            - the purpose that the explorer serves.
        """

        # name of the main feature; also the dataframe column that holds it
        PRIMARY_FEATURE = "image"

        # columns named as mandatory: the main feature plus the label
        MANDATORY_COLUMNS = [PRIMARY_FEATURE, "label"]

        # tooltip settings; the image entry takes its style from the
        # package-level configuration, read once when the class is defined
        TOOLTIP_KWARGS = dict(
            label={"label": "Label"},
            image={"image": hover.config["visual"]["tooltip_img_style"]},
            coords=True,
            index=True,
        )
    

    BokehForText (BokehBaseExplorer)

    BokehBaseExplorer with text (str) as the main feature.

    Assumes on top of its parent class:

    • in supplied dataframes:
        ◦ (always) text data in a text column

    Does not assume:

    • what the explorer serves to do.
    Source code in hover/core/explorer/feature.py
    class BokehForText(BokehBaseExplorer):
        """
        ???+ note "`BokehBaseExplorer` with `text` (`str`) as the main feature."
            Assumes on top of its parent class:

            - in supplied dataframes
              - (always) text data in a `text` column

            Does not assume:

            - what the explorer serves to do.
        """

        PRIMARY_FEATURE = "text"
        MANDATORY_COLUMNS = [PRIMARY_FEATURE, "label"]
        TOOLTIP_KWARGS = {
            "label": {"label": "Label"},
            "text": {"text": "Text"},
            "coords": True,
            "index": True,
        }

        def _setup_search_widgets(self):
            """
            ???+ note "Create positive/negative text search boxes."
                Stores them as `self.search_pos` and `self.search_neg`.
            """
            common_kwargs = dict(width_policy="fit", height_policy="fit")
            pos_title, neg_title = "Text contains (python regex):", "Text does not contain:"
            self.search_pos = TextInput(title=pos_title, **common_kwargs)
            self.search_neg = TextInput(title=neg_title, **common_kwargs)

        def _search_watch_widgets(self):
            """
            ???+ note "List the widgets whose values make up the search input."
            """
            return [self.search_pos, self.search_neg]

        def _validate_search_input(self):
            """
            ???+ note "Both search boxes must hold strings that compile as python regex."
                An empty box compiles fine and simply does not contribute to
                the score. Rejecting malformed patterns here keeps `re.error`
                from being raised later, while texts are being scored.

                Returns `True` when both patterns compile, `False` otherwise.
            """
            for _pattern in (self.search_pos.value, self.search_neg.value):
                try:
                    re.compile(_pattern)
                except re.error:
                    return False
            return True

        def _get_search_score_function(self):
            """
            ???+ note "Dynamically create a single-argument scoring function."
                The returned function maps a text to an integer score:

                - positive pattern (if non-empty): `+1` on a match, `-2` otherwise
                - negative pattern (if non-empty): `-2` on a match, `+1` otherwise

                An empty pattern contributes `0`.
            """
            pos_regex, neg_regex = self.search_pos.value, self.search_neg.value

            # compile once per search instead of going through `re.search`
            # with the raw pattern string for every scored text
            pos_compiled = re.compile(pos_regex) if pos_regex else None
            neg_compiled = re.compile(neg_regex) if neg_regex else None

            def regex_score(text):
                score = 0
                if pos_compiled is not None:
                    score += 1 if pos_compiled.search(text) else -2
                if neg_compiled is not None:
                    score += -2 if neg_compiled.search(text) else 1
                return score

            return regex_score
    

    BokehForUrlToVector (BokehBaseExplorer)

    A layer of abstraction for BokehBaseExplorer subclasses whose feature-type-specific mechanisms work the same way through vectors.
    Source code in hover/core/explorer/feature.py
    class BokehForUrlToVector(BokehBaseExplorer):
        """
        ???+ note "A layer of abstraction for `BokehBaseExplorer` subclasses whose feature-type-specific mechanisms work the same way through vectors."
            Search is similarity-based: the query URL and each document URL
            are vectorized, and their cosine similarity is compared against
            a threshold chosen with a slider.
        """

        def _setup_search_widgets(self):
            """
            ???+ note "Create similarity search widgets."
                Stores a URL text box as `self.search_sim` and a threshold
                slider (0.0 to 1.0, default 0.9) as `self.search_threshold`.
            """
            self.search_sim = TextInput(
                # NOTE: `str.capitalize()` upper-cases the first character and
                # lower-cases the rest, so the title renders "(enter url)"
                title=f"{self.__class__.PRIMARY_FEATURE} similarity search (enter URL)".capitalize(),
                width_policy="fit",
                height_policy="fit",
            )
            self.search_threshold = Slider(
                start=0.0,
                end=1.0,
                value=0.9,
                # fewer steps allowed because refreshing search result can be expensive
                step=0.1,
                title="Similarity threshold",
            )

        def _search_watch_widgets(self):
            """
            ???+ note "List the widgets whose values make up the search input."
            """
            return [self.search_sim, self.search_threshold]

        def _subroutine_search_create_callbacks(self):
            """
            ???+ note "Create search callback functions based on feature attributes."
                Builds a cached, unit-normalizing wrapper around the first
                vectorizer found on `self.linked_dataset`, stores it under
                `self._dynamic_resources["normalized_vectorizer"]`, then
                defers to the parent implementation.

                Raises `AssertionError` when there is no linked dataset or
                the dataset has no known vectorizer.
            """
            # determine cache size for normalized vectorizer
            num_points = sum(_df.shape[0] for _df in self.dfs.values())
            cache_size = min(num_points, int(1e5))

            # find vectorizer
            assert hasattr(self, "linked_dataset"), "need linked_dataset for its vectorizer"
            found_vectorizers = self.linked_dataset.vectorizer_lookup.values()
            assert len(found_vectorizers) > 0, "dataset has no known vectorizer"

            raw_vectorizer = list(found_vectorizers)[0]

            # gain speed up by caching and normalization
            @lru_cache(maxsize=cache_size)
            def normalized_vectorizer(feature):
                # a failed vectorization yields None, which is cached like
                # any other result
                try:
                    vec = raw_vectorizer(feature)
                except Exception as e:
                    self._warn(f"vectorizer crashed: {e}; assigning None as vector.")
                    return None
                norm = np.linalg.norm(vec)
                # the small epsilon avoids dividing by zero on an all-zero vector
                return vec / (norm + 1e-16)

            self._dynamic_resources["normalized_vectorizer"] = normalized_vectorizer

            super()._subroutine_search_create_callbacks()

        def _get_search_score_function(self):
            """
            ???+ note "Dynamically create a single-argument scoring function."
                The returned function maps a document URL to:

                - `0` when the query or the document cannot be vectorized
                - `1` when the rescaled cosine similarity reaches the threshold
                - `-1` otherwise
            """
            vectorizer = self._dynamic_resources["normalized_vectorizer"]
            url_query = self.search_sim.value
            sim_thresh = self.search_threshold.value
            vec_query = vectorizer(url_query)

            def cosine_based_score(url_doc):
                # edge case: query is invalid for vectorization; check this
                # first so that documents are not vectorized for nothing
                if vec_query is None:
                    return 0

                # edge case: doc is invalid for vectorization
                vec_doc = vectorizer(url_doc)
                if vec_doc is None:
                    return 0

                # common case: query and doc are both valid
                # rescale the dot product of unit vectors from [-1, 1] to [0, 1]
                query_doc_sim = (np.dot(vec_query, vec_doc) + 1.0) / 2.0
                if query_doc_sim >= sim_thresh:
                    return 1
                return -1

            return cosine_based_score

        def _validate_search_input(self):
            """
            ???+ note "Must be some url pointing to a suffixed file."

                For speed, avoid sending web requests in this validation step.

                Returns `False` for input that cannot be parsed as a URL or
                whose path has no file suffix, `True` otherwise.
            """
            from urllib.parse import urlparse
            from pathlib import Path

            url_query = self.search_sim.value
            try:
                parsed_path = urlparse(url_query).path
            except ValueError:
                # urlparse rejects some malformed input, e.g. "http://["
                return False
            return bool(Path(parsed_path).suffix)