Skip to content

Feature Selection

FEATURE_SELECTION

Source code in engines/contentFilterEngine/miscellaneous_techniques/feature_selection.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
class FEATURE_SELECTION:
    """
    Select the ``k`` highest-scoring columns of a 2-D feature matrix.

    Supported scoring methods:
        'chi2': chi-squared statistic against ``y``, computed by
            ``SelectKBest`` on min-max scaled features.
        'variance': per-column variance of ``X``.
        'correlation': absolute Pearson correlation of each column with ``y``.
    """

    def __init__(self, k=10, method='chi2'):
        """
        Initialize feature selection.

        Args:
            k (int): Number of top features to select
            method (str): Feature selection method ('chi2', 'variance', 'correlation')
        """
        self.k = k
        self.method = method
        # Boolean mask over the input columns; set by fit_transform().
        self.selected_features = None
        # One score per input column; set by fit_transform().
        self.feature_scores = None
        # Only used by the 'chi2' method, which needs non-negative inputs.
        self.scaler = MinMaxScaler()

    def _top_k_mask(self, ranking):
        """Return a boolean mask marking the ``self.k`` largest entries of ``ranking``."""
        n_features = ranking.shape[0]
        if self.k < 0:
            raise ValueError(f"k must be non-negative, got {self.k!r}")
        k = min(self.k, n_features)
        mask = np.zeros(n_features, dtype=bool)
        # Guard k == 0 explicitly: the slice [-0:] would select everything.
        if k > 0:
            # argsort places NaN last, which would rank undefined scores as
            # the best ones; push them to the bottom instead.
            ranking = np.where(np.isnan(ranking), -np.inf, ranking)
            mask[np.argsort(ranking)[-k:]] = True
        return mask

    def fit_transform(self, X, y=None):
        """
        Fit the feature selector and transform the data.

        The returned columns keep their original left-to-right order, so the
        layout matches what transform() produces afterwards.

        Args:
            X: Input features, 2-D (samples x features)
            y: Target values; required for 'chi2' and 'correlation',
                ignored for 'variance'

        Returns:
            The selected columns of X (min-max scaled when method is 'chi2').

        Raises:
            ValueError: If the method is unknown, if y is missing for a
                method that needs it, or if k is negative.
        """
        if self.method == 'chi2':
            if y is None:
                raise ValueError("y is required for method 'chi2'")
            # Scale features to non-negative for chi2
            X_scaled = self.scaler.fit_transform(X)
            selector = SelectKBest(chi2, k=self.k)
            X_selected = selector.fit_transform(X_scaled, y)
            self.feature_scores = selector.scores_
            self.selected_features = selector.get_support()
            return X_selected

        if self.method == 'variance':
            X = np.asarray(X)
            scores = np.var(X, axis=0)
            ranking = scores
        elif self.method == 'correlation':
            if y is None:
                raise ValueError("y is required for method 'correlation'")
            X = np.asarray(X)
            y = np.asarray(y).ravel()
            # Constant columns have an undefined correlation (NaN); silence
            # the resulting floating-point warnings, NaN is handled in ranking.
            with np.errstate(divide='ignore', invalid='ignore'):
                scores = np.array(
                    [np.corrcoef(X[:, i], y)[0, 1] for i in range(X.shape[1])]
                )
            # Strong negative correlation is as informative as positive.
            ranking = np.abs(scores)
        else:
            # Previously an unknown method silently returned None.
            raise ValueError(
                f"Unknown method {self.method!r}; expected 'chi2', "
                "'variance' or 'correlation'"
            )

        mask = self._top_k_mask(ranking)
        self.selected_features = mask
        self.feature_scores = scores
        # Index with the mask (not the argsort order) so the column order
        # agrees with transform().
        return X[:, mask]

    def transform(self, X):
        """
        Transform new data using selected features.

        For the 'chi2' method the fitted scaler is applied first, because
        fit_transform() returns scaled values for that method.

        Raises:
            ValueError: If fit_transform() has not been called yet.
        """
        if self.selected_features is None:
            raise ValueError("Fit the selector first using fit_transform()")
        if self.method == 'chi2':
            X = self.scaler.transform(X)
        else:
            X = np.asarray(X)
        return X[:, self.selected_features]

    def get_feature_importance(self):
        """
        Return feature importance scores.

        Raises:
            ValueError: If fit_transform() has not been called yet.
        """
        if self.feature_scores is None:
            raise ValueError("Fit the selector first using fit_transform()")
        return self.feature_scores

__init__(k=10, method='chi2')

Initialize feature selection.

Parameters:

Name Type Description Default
k int

Number of top features to select

10
method str

Feature selection method ('chi2', 'variance', 'correlation')

'chi2'
Source code in engines/contentFilterEngine/miscellaneous_techniques/feature_selection.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
def __init__(self, k=10, method='chi2'):
    """
    Set up an unfitted feature selector.

    Args:
        k (int): How many of the best-scoring features to keep
        method (str): Scoring strategy, one of 'chi2', 'variance', 'correlation'
    """
    # Caller-supplied configuration.
    self.k, self.method = k, method
    # Results stay empty until fit_transform() fills them in.
    self.selected_features = self.feature_scores = None
    # Scaler instance used by the 'chi2' branch of fit_transform().
    self.scaler = MinMaxScaler()

fit_transform(X, y=None)

Fit the feature selector and transform the data.

Parameters:

Name Type Description Default
X

Input features

required
y

Target values (required for the 'chi2' and 'correlation' methods; not used by 'variance')

None
Source code in engines/contentFilterEngine/miscellaneous_techniques/feature_selection.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def fit_transform(self, X, y=None):
    """
    Fit the feature selector and transform the data.

    The returned columns keep their original left-to-right order, so the
    layout matches indexing X with the stored ``selected_features`` mask.

    Args:
        X: Input features, 2-D (samples x features)
        y: Target values; required for 'chi2' and 'correlation',
            ignored for 'variance'

    Returns:
        The selected columns of X (min-max scaled when method is 'chi2').

    Raises:
        ValueError: If the method is unknown, if y is missing for a method
            that needs it, or if k is negative.
    """
    if self.method == 'chi2':
        if y is None:
            raise ValueError("y is required for method 'chi2'")
        # Scale features to non-negative for chi2
        X_scaled = self.scaler.fit_transform(X)
        selector = SelectKBest(chi2, k=self.k)
        X_selected = selector.fit_transform(X_scaled, y)
        self.feature_scores = selector.scores_
        self.selected_features = selector.get_support()
        return X_selected

    if self.method == 'variance':
        X = np.asarray(X)
        scores = np.var(X, axis=0)
        ranking = scores
    elif self.method == 'correlation':
        if y is None:
            raise ValueError("y is required for method 'correlation'")
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        # Constant columns have an undefined correlation (NaN); silence the
        # resulting floating-point warnings, NaN is handled when ranking.
        with np.errstate(divide='ignore', invalid='ignore'):
            scores = np.array(
                [np.corrcoef(X[:, i], y)[0, 1] for i in range(X.shape[1])]
            )
        # Strong negative correlation is as informative as positive.
        ranking = np.abs(scores)
    else:
        # Previously an unknown method silently returned None.
        raise ValueError(
            f"Unknown method {self.method!r}; expected 'chi2', "
            "'variance' or 'correlation'"
        )

    if self.k < 0:
        raise ValueError(f"k must be non-negative, got {self.k!r}")
    n_features = ranking.shape[0]
    k = min(self.k, n_features)
    mask = np.zeros(n_features, dtype=bool)
    # Guard k == 0 explicitly: the slice [-0:] would select everything.
    if k > 0:
        # argsort places NaN last, which would rank undefined scores as the
        # best ones; push them to the bottom instead.
        ranking = np.where(np.isnan(ranking), -np.inf, ranking)
        mask[np.argsort(ranking)[-k:]] = True

    self.selected_features = mask
    self.feature_scores = scores
    # Index with the mask (not the argsort order) so the column order agrees
    # with later mask-based selection on new data.
    return X[:, mask]

get_feature_importance()

Return feature importance scores.

Source code in engines/contentFilterEngine/miscellaneous_techniques/feature_selection.py
62
63
64
65
66
def get_feature_importance(self):
    """
    Hand back the per-feature scores recorded by the last fit.

    Raises:
        ValueError: If no scores have been recorded yet.
    """
    scores = self.feature_scores
    if scores is not None:
        return scores
    # Nothing stored means fit_transform() has not run.
    raise ValueError("Fit the selector first using fit_transform()")

transform(X)

Transform new data using selected features.

Source code in engines/contentFilterEngine/miscellaneous_techniques/feature_selection.py
56
57
58
59
60
def transform(self, X):
    """
    Transform new data using selected features.

    For the 'chi2' method the fitted scaler is applied before selecting
    columns, because fit_transform() returns min-max scaled values for that
    method; without it, fitted and transformed data are on different scales.

    Args:
        X: Input features with the same column layout used for fitting

    Returns:
        The selected columns of X (scaled when method is 'chi2').

    Raises:
        ValueError: If fit_transform() has not been called yet.
    """
    if self.selected_features is None:
        raise ValueError("Fit the selector first using fit_transform()")
    if self.method == 'chi2':
        X = self.scaler.transform(X)
    else:
        # Accept array-likes (e.g. nested lists) as well as ndarrays.
        X = np.asarray(X)
    return X[:, self.selected_features]