Skip to content

LDA

LDA

Source code in engines/contentFilterEngine/probabilistic_statistical_methods/lda.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
class LDA:
    """
    Thin wrapper that pairs a bag-of-words vectorizer with a Latent Dirichlet
    Allocation topic model and exposes fit / transform / recommend helpers.
    """

    def __init__(self, n_components: int = 10, max_iter: int = 10):
        """
        Initialize the LDA model with the specified number of topics.

        Parameters:
        - n_components (int): Number of topics.
        - max_iter (int): Maximum number of iterations for the EM algorithm.
        """
        # English stop words are removed before term counting.
        self.vectorizer = CountVectorizer(stop_words='english')
        # A fixed random_state keeps the learned topics reproducible.
        self.lda_model = LatentDirichletAllocation(n_components=n_components, max_iter=max_iter, random_state=42)
        logger.info(f"LDA initialized with {n_components} topics and {max_iter} max iterations.")

    def fit(self, documents: List[str]) -> None:
        """
        Fit the LDA model on the provided documents.

        The vectorizer learns its vocabulary from ``documents`` and the
        resulting term-count matrix is used to train the topic model.

        Parameters:
        - documents (List[str]): List of documents to train the model.
        """
        logger.info("Fitting LDA model on documents.")
        count_matrix = self.vectorizer.fit_transform(documents)
        self.lda_model.fit(count_matrix)
        logger.info("LDA model training completed.")

    def transform(self, documents: List[str]) -> Any:
        """
        Transform documents into the LDA topic space.

        Parameters:
        - documents (List[str]): List of documents to transform.

        Returns:
        - Transformed document matrix in topic space (one row per document).
        """
        logger.info("Transforming documents into LDA topic space.")
        count_matrix = self.vectorizer.transform(documents)
        return self.lda_model.transform(count_matrix)

    def recommend(self, query: str, top_n: int = 10) -> List[int]:
        """
        Recommend items based on the similarity of the query to the topics.

        The query is projected into topic space and that topic distribution
        is multiplied by the model's topic-term matrix (``components_``),
        producing one score per column of the fitted vectorizer.

        NOTE(review): the returned indices therefore address the vectorizer's
        feature columns. If callers expect indices of the fitted documents,
        ``fit`` would have to retain the document-topic matrix - confirm the
        intended meaning of "item" with the callers.

        Parameters:
        - query (str): The query text for which to generate recommendations.
        - top_n (int): Number of top recommendations to return. Values below
          zero are treated as zero.

        Returns:
        - List[int]: Indices of the highest-scoring columns, best first.
        """
        logger.info("Generating recommendations using LDA.")
        # Single inference pass; the previous code transformed the query twice
        # and discarded the first result.
        topic_distribution = self.transform([query])
        # (1 x n_topics) @ (n_topics x n_features): components_ must NOT be
        # transposed here, otherwise the inner dimensions do not line up.
        similarity_scores = (topic_distribution @ self.lda_model.components_).flatten()
        # Clamp so a negative top_n cannot slice from the wrong end.
        limit = max(top_n, 0)
        top_indices = similarity_scores.argsort()[::-1][:limit]
        logger.info(f"Top {top_n} recommendations generated using LDA.")
        return top_indices.tolist()

__init__(n_components=10, max_iter=10)

Initialize the LDA model with the specified number of topics.

Parameters:

- n_components (int): Number of topics.
- max_iter (int): Maximum number of iterations for the EM algorithm.

Source code in engines/contentFilterEngine/probabilistic_statistical_methods/lda.py
11
12
13
14
15
16
17
18
19
20
21
def __init__(self, n_components: int = 10, max_iter: int = 10):
    """
    Build the term-count vectorizer and the underlying LDA estimator.

    Parameters:
    - n_components (int): Number of topics.
    - max_iter (int): Maximum number of iterations for the EM algorithm.
    """
    # Estimator settings gathered in one place; the seed is pinned to 42.
    lda_settings = {
        "n_components": n_components,
        "max_iter": max_iter,
        "random_state": 42,
    }
    self.vectorizer = CountVectorizer(stop_words='english')
    self.lda_model = LatentDirichletAllocation(**lda_settings)
    logger.info(f"LDA initialized with {n_components} topics and {max_iter} max iterations.")

fit(documents)

Fit the LDA model on the provided documents.

Parameters:

- documents (List[str]): List of documents to train the model.

Source code in engines/contentFilterEngine/probabilistic_statistical_methods/lda.py
23
24
25
26
27
28
29
30
31
32
33
def fit(self, documents: List[str]):
    """
    Learn the vocabulary and train the topic model on a corpus.

    Parameters:
    - documents (List[str]): Raw text documents used for training.
    """
    logger.info("Fitting LDA model on documents.")
    # The vectorizer learns its vocabulary and emits term counts in one call;
    # those counts are handed straight to the topic model.
    self.lda_model.fit(self.vectorizer.fit_transform(documents))
    logger.info("LDA model training completed.")

recommend(query, top_n=10)

Recommend items based on the similarity of the query to the topics.

Parameters:

- query (str): The query text for which to generate recommendations.
- top_n (int): Number of top recommendations to return.

Returns: - List[int]: List of recommended item indices.

Source code in engines/contentFilterEngine/probabilistic_statistical_methods/lda.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def recommend(self, query: str, top_n: int = 10) -> List[int]:
    """
    Recommend items based on the similarity of the query to the topics.

    The query is projected into topic space and that topic distribution is
    multiplied by the model's topic-term matrix (``components_``), producing
    one score per column of the fitted vectorizer.

    NOTE(review): the returned indices therefore address the vectorizer's
    feature columns. If callers expect indices of the fitted documents, the
    document-topic matrix would have to be retained at fit time - confirm
    the intended meaning of "item" with the callers.

    Parameters:
    - query (str): The query text for which to generate recommendations.
    - top_n (int): Number of top recommendations to return. Values below
      zero are treated as zero.

    Returns:
    - List[int]: Indices of the highest-scoring columns, best first.
    """
    logger.info("Generating recommendations using LDA.")
    # Single inference pass; the previous code transformed the query twice
    # and discarded the first result.
    topic_distribution = self.transform([query])
    # (1 x n_topics) @ (n_topics x n_features): components_ must NOT be
    # transposed here, otherwise the inner dimensions do not line up.
    similarity_scores = (topic_distribution @ self.lda_model.components_).flatten()
    # Clamp so a negative top_n cannot slice from the wrong end.
    limit = max(top_n, 0)
    top_indices = similarity_scores.argsort()[::-1][:limit]
    logger.info(f"Top {top_n} recommendations generated using LDA.")
    return top_indices.tolist()

transform(documents)

Transform documents into the LDA topic space.

Parameters:

- documents (List[str]): List of documents to transform.

Returns: - Transformed document matrix in topic space.

Source code in engines/contentFilterEngine/probabilistic_statistical_methods/lda.py
35
36
37
38
39
40
41
42
43
44
45
46
47
def transform(self, documents: List[str]) -> Any:
    """
    Project documents into the topic space of the LDA model.

    Parameters:
    - documents (List[str]): Documents to convert.

    Returns:
    - Whatever the underlying LDA model's ``transform`` yields for the
      vectorized documents.
    """
    logger.info("Transforming documents into LDA topic space.")
    # Step 1: raw text -> term counts using the vectorizer held on the instance.
    term_counts = self.vectorizer.transform(documents)
    # Step 2: term counts -> topic representation.
    topic_mixture = self.lda_model.transform(term_counts)
    return topic_mixture