Feature Extraction

FeatureExtraction

Source code in engines/contentFilterEngine/performance_scalability/feature_extraction.py
from typing import Any, List
import logging

import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Module-level logger, reconstructed from usage; the file's actual import
# block sits above the excerpted lines and is not shown on this page.
logger = logging.getLogger(__name__)


class FeatureExtraction:
    def __init__(self, max_features=5000):
        """
        Initializes the FeatureExtraction with a TF-IDF vectorizer.

        Parameters:
        - max_features (int): The maximum number of features (vocabulary size).
        """
        self.max_features = max_features
        self.lemmatizer = WordNetLemmatizer()
        self.vectorizer = TfidfVectorizer(
            max_features=self.max_features,
            stop_words='english',  # Use built-in stop words
            tokenizer=self.tokenize
        )
        logger.info(f"FeatureExtraction initialized with max_features={self.max_features}.")

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenizes and lemmatizes the input text.

        Parameters:
        - text (str): The text to tokenize.

        Returns:
        - list: A list of processed tokens.
        """
        tokens = nltk.word_tokenize(text.lower())
        lemmatized = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token.isalpha()
        ]
        logger.debug(f"Tokenized text: {lemmatized}")
        return lemmatized

    def fit_transform(self, documents: List[str]):
        """
        Fits the TF-IDF vectorizer on the documents and transforms them into feature vectors.

        Parameters:
        - documents (list of str): The list of documents to process.

        Returns:
        - sparse matrix: The TF-IDF feature matrix.
        """
        logger.info("Fitting and transforming documents into TF-IDF features.")
        return self.vectorizer.fit_transform(documents)

    def transform(self, documents: List[str]) -> Any:
        """
        Transforms the documents into TF-IDF feature vectors using the already fitted vectorizer.

        Parameters:
        - documents (list of str): The list of documents to transform.

        Returns:
        - sparse matrix: The TF-IDF feature matrix.
        """
        logger.info("Transforming documents into LSA latent space.")
        tfidf_matrix = self.vectorizer.transform(documents)  # Use transform, not fit
        return self.lsa_model.transform(tfidf_matrix)

    def get_feature_names(self) -> List[str]:
        """
        Retrieves the feature names (vocabulary) from the vectorizer.

        Returns:
        - list: A list of feature names.
        """
        # get_feature_names_out() returns a NumPy array; convert it so the
        # return value matches the List[str] annotation.
        return list(self.vectorizer.get_feature_names_out())
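
As a hedged end-to-end sketch (the corpus is invented for illustration, and the class is assumed importable from the module above): download the NLTK data once, fit on a corpus, then inspect the features. Depending on the NLTK version, the 'punkt_tab' resource may also be needed.

import nltk

nltk.download('punkt')    # tokenizer models used by nltk.word_tokenize
nltk.download('wordnet')  # lexical database used by WordNetLemmatizer

extractor = FeatureExtraction(max_features=100)
documents = [
    "Cats are running in the garden.",
    "A dog ran across the gardens.",
]
tfidf = extractor.fit_transform(documents)
print(tfidf.shape)                    # (2, n_terms), a scipy.sparse matrix
print(extractor.get_feature_names())  # the learned vocabulary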

__init__(max_features=5000)

Initializes the FeatureExtraction with a TF-IDF vectorizer.

Parameters:
- max_features (int): The maximum number of features (vocabulary size).

Source code in engines/contentFilterEngine/performance_scalability/feature_extraction.py
def __init__(self, max_features=5000):
    """
    Initializes the FeatureExtraction with a TF-IDF vectorizer.

    Parameters:
    - max_features (int): The maximum number of features (vocabulary size).
    """
    self.max_features = max_features
    self.lemmatizer = WordNetLemmatizer()
    self.vectorizer = TfidfVectorizer(
        max_features=self.max_features,
        stop_words='english',  # Use built-in stop words
        tokenizer=self.tokenize
    )
    logger.info(f"FeatureExtraction initialized with max_features={self.max_features}.")

fit_transform(documents)

Fits the TF-IDF vectorizer on the documents and transforms them into feature vectors.

Parameters:
- documents (list of str): The list of documents to process.

Returns:
- sparse matrix: The TF-IDF feature matrix.

Source code in engines/contentFilterEngine/performance_scalability/feature_extraction.py
def fit_transform(self, documents: List[str]):
    """
    Fits the TF-IDF vectorizer on the documents and transforms them into feature vectors.

    Parameters:
    - documents (list of str): The list of documents to process.

    Returns:
    - sparse matrix: The TF-IDF feature matrix.
    """
    logger.info("Fitting and transforming documents into TF-IDF features.")
    return self.vectorizer.fit_transform(documents)
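
A minimal usage sketch; the two-document corpus is invented for illustration. fit_transform learns the vocabulary and returns a scipy.sparse matrix with one row per document:

corpus = [
    "The cat sat on the mat.",
    "Dogs chase cats.",
]
extractor = FeatureExtraction()
tfidf = extractor.fit_transform(corpus)
print(tfidf.shape)  # (2, n_terms): one row per document
print(tfidf.nnz)    # number of non-zero TF-IDF weights stored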

get_feature_names()

Retrieves the feature names (vocabulary) from the vectorizer.

Returns:
- list: A list of feature names.

Source code in engines/contentFilterEngine/performance_scalability/feature_extraction.py
def get_feature_names(self) -> List[str]:
    """
    Retrieves the feature names (vocabulary) from the vectorizer.

    Returns:
    - list: A list of feature names.
    """
    # get_feature_names_out() returns a NumPy array; convert it so the
    # return value matches the List[str] annotation.
    return list(self.vectorizer.get_feature_names_out())
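
For illustration (corpus invented): the feature names are column-aligned with the TF-IDF matrix, so they can be used to read off a document's highest-weighted terms:

import numpy as np

extractor = FeatureExtraction()
tfidf = extractor.fit_transform(["The cat sat on the mat.", "Dogs chase cats."])
names = extractor.get_feature_names()  # vocabulary terms, column-aligned
row = tfidf[0].toarray().ravel()       # dense TF-IDF weights of document 0
top = np.argsort(row)[::-1][:3]        # indices of the three largest weights
print([(names[i], round(row[i], 3)) for i in top])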

tokenize(text)

Tokenizes and lemmatizes the input text.

Parameters:
- text (str): The text to tokenize.

Returns:
- list: A list of processed tokens.

Source code in engines/contentFilterEngine/performance_scalability/feature_extraction.py
def tokenize(self, text: str) -> List[str]:
    """
    Tokenizes and lemmatizes the input text.

    Parameters:
    - text (str): The text to tokenize.

    Returns:
    - list: A list of processed tokens.
    """
    tokens = nltk.word_tokenize(text.lower())
    lemmatized = [
        self.lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha()
    ]
    logger.debug(f"Tokenized text: {lemmatized}")
    return lemmatized
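
A small illustration of the pipeline (sentence invented): lowercasing, word tokenization, dropping non-alphabetic tokens, then WordNet lemmatization. Note that lemmatize() defaults to treating tokens as nouns, so verb forms like "running" pass through unchanged, and stop words are only removed later by the vectorizer:

extractor = FeatureExtraction()
print(extractor.tokenize("The cats were running in 2 gardens!"))
# ['the', 'cat', 'were', 'running', 'in', 'garden']
# '2' and '!' are dropped by isalpha(); 'cats' and 'gardens' are lemmatized.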

transform(documents)

Transforms the documents into TF-IDF feature vectors using the already fitted vectorizer.

Parameters:
- documents (list of str): The list of documents to transform.

Returns:
- sparse matrix: The TF-IDF feature matrix.

Source code in engines/contentFilterEngine/performance_scalability/feature_extraction.py
def transform(self, documents: List[str]) -> Any:
    """
    Transforms the documents into TF-IDF feature vectors using the already fitted vectorizer.

    Parameters:
    - documents (list of str): The list of documents to transform.

    Returns:
    - sparse matrix: The TF-IDF feature matrix.
    """
    logger.info("Transforming documents into LSA latent space.")
    tfidf_matrix = self.vectorizer.transform(documents)  # Use transform, not fit
    return self.lsa_model.transform(tfidf_matrix)
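
A hedged sketch of inference-time use (documents invented): fit once on a training corpus, then call transform on unseen documents; terms outside the learned vocabulary are simply ignored:

extractor = FeatureExtraction()
extractor.fit_transform(["cats chase mice", "dogs chase cats"])
new_docs = ["mice fear cats", "birds sing"]  # 'birds'/'sing' are out of vocabulary
matrix = extractor.transform(new_docs)
print(matrix.shape)   # (2, n_terms): same column space as the training matrix
print(matrix[1].nnz)  # 0: no overlap with the learned vocabulary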