Skip to content

Noise Handling

NOISE_HANDLING

Source code in engines/contentFilterEngine/miscellaneous_techniques/noise_handling.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
class NOISE_HANDLING:
    """Drop noisy (outlier) rows from a 2-D feature matrix.

    The detection strategy is chosen with ``method``:

    * ``'isolation_forest'`` -- fit an ``IsolationForest`` with the configured
      ``contamination`` and keep the rows it labels ``1``.
    * ``'zscore'`` -- scale the data with ``RobustScaler`` and keep rows whose
      scaled features all have absolute value below 3.
      NOTE(review): despite the name this is a robust scaling, not a classical
      mean/std z-score -- confirm this is intended.
    * ``'iqr'`` -- keep rows whose features all lie within the Tukey fences
      ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` computed per column.
    """

    # Methods understood by fit_transform(); used to build the error message.
    _SUPPORTED_METHODS = ('isolation_forest', 'zscore', 'iqr')

    def __init__(self, method='isolation_forest', contamination=0.1):
        """
        Initialize noise handling.

        Args:
            method (str): Noise detection method ('isolation_forest', 'zscore', 'iqr')
            contamination (float): Expected proportion of outliers in the dataset.
                Only passed on to the isolation forest; the other methods ignore it.
        """
        self.method = method
        self.contamination = contamination
        # Populated by fit_transform() when method == 'isolation_forest'.
        self.outlier_detector = None
        # Only used by the 'zscore' method.
        self.scaler = RobustScaler()

    def fit_transform(self, X):
        """
        Detect and handle noisy samples in the data.

        Args:
            X: Input data, indexable by a boolean row mask (assumes a 2-D
                array of shape (n_samples, n_features) -- TODO confirm).

        Returns:
            tuple: ``(X_clean, is_inlier)`` where ``is_inlier`` is a boolean
            mask with one entry per row of ``X`` and ``X_clean`` is
            ``X[is_inlier]``.

        Raises:
            ValueError: If ``self.method`` is not one of the supported methods.
        """
        if self.method == 'isolation_forest':
            self.outlier_detector = IsolationForest(contamination=self.contamination)
            # fit_predict labels the rows to keep with 1.
            is_inlier = self.outlier_detector.fit_predict(X) == 1
            return X[is_inlier], is_inlier

        elif self.method == 'zscore':
            X_scaled = self.scaler.fit_transform(X)
            z_scores = np.abs(X_scaled)
            # A row is kept only if every one of its features is within range.
            is_inlier = np.all(z_scores < 3, axis=1)
            return X[is_inlier], is_inlier

        elif self.method == 'iqr':
            Q1 = np.percentile(X, 25, axis=0)
            Q3 = np.percentile(X, 75, axis=0)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
            # Fences are inclusive: with strict comparisons a constant feature
            # (IQR == 0) would collapse the fences to a single value and every
            # row would be rejected.
            is_inlier = np.all((X >= lower) & (X <= upper), axis=1)
            return X[is_inlier], is_inlier

        # Fail loudly instead of silently returning None for a typo'd method.
        raise ValueError(
            f"Unknown noise handling method {self.method!r}; "
            f"expected one of {self._SUPPORTED_METHODS}"
        )

    def transform(self, X):
        """Apply noise detection to new data.

        For ``'isolation_forest'`` the detector fitted by ``fit_transform()``
        is reused. For every other method this delegates to
        ``fit_transform(X)``, so the statistics are recomputed from ``X``
        itself rather than reused from earlier data.

        Args:
            X: Input data, indexable by a boolean row mask.

        Returns:
            The rows of ``X`` that are considered inliers.

        Raises:
            ValueError: If the isolation forest has not been fitted yet, or if
                ``self.method`` is not a supported method.
        """
        if self.method == 'isolation_forest':
            if self.outlier_detector is None:
                raise ValueError("Fit the detector first using fit_transform()")
            return X[self.outlier_detector.predict(X) == 1]
        else:
            return self.fit_transform(X)[0]

__init__(method='isolation_forest', contamination=0.1)

Initialize noise handling.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| method | str | Noise detection method ('isolation_forest', 'zscore', 'iqr') | 'isolation_forest' |
| contamination | float | Expected proportion of outliers in the dataset | 0.1 |

Source code in engines/contentFilterEngine/miscellaneous_techniques/noise_handling.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
def __init__(self, method='isolation_forest', contamination=0.1):
    """
    Set up the noise handler's configuration and helper objects.

    Args:
        method (str): Which detection strategy to use: 'isolation_forest',
            'zscore' or 'iqr'.
        contamination (float): Expected fraction of outliers in the data.
    """
    # Helper objects: a scaler created up front, and a detector slot that
    # starts out empty.
    self.scaler = RobustScaler()
    self.outlier_detector = None

    # User-supplied configuration, stored as given.
    self.contamination = contamination
    self.method = method

fit_transform(X)

Detect and handle noisy samples in the data.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| X | | Input data | required |

Source code in engines/contentFilterEngine/miscellaneous_techniques/noise_handling.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def fit_transform(self, X):
    """
    Detect and handle noisy samples in the data.

    The strategy is selected by ``self.method``:

    * ``'isolation_forest'`` -- fit an ``IsolationForest`` (stored on
      ``self.outlier_detector``) and keep the rows it labels ``1``.
    * ``'zscore'`` -- scale with ``self.scaler`` and keep rows whose scaled
      features all have absolute value below 3.
    * ``'iqr'`` -- keep rows whose features all lie within the per-column
      fences ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``.

    Args:
        X: Input data, indexable by a boolean row mask (assumes a 2-D array
            of shape (n_samples, n_features) -- TODO confirm).

    Returns:
        tuple: ``(X_clean, is_inlier)`` where ``is_inlier`` is a boolean mask
        with one entry per row of ``X`` and ``X_clean`` is ``X[is_inlier]``.

    Raises:
        ValueError: If ``self.method`` is not a supported method.
    """
    if self.method == 'isolation_forest':
        self.outlier_detector = IsolationForest(contamination=self.contamination)
        is_inlier = self.outlier_detector.fit_predict(X) == 1
        return X[is_inlier], is_inlier

    elif self.method == 'zscore':
        X_scaled = self.scaler.fit_transform(X)
        z_scores = np.abs(X_scaled)
        # A row is kept only if every one of its features is within range.
        is_inlier = np.all(z_scores < 3, axis=1)
        return X[is_inlier], is_inlier

    elif self.method == 'iqr':
        Q1 = np.percentile(X, 25, axis=0)
        Q3 = np.percentile(X, 75, axis=0)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        # Fences are inclusive: with strict comparisons a constant feature
        # (IQR == 0) would collapse the fences to a single value and every
        # row would be rejected.
        is_inlier = np.all((X >= lower) & (X <= upper), axis=1)
        return X[is_inlier], is_inlier

    # Fail loudly instead of silently returning None for a typo'd method.
    raise ValueError(
        f"Unknown noise handling method {self.method!r}; "
        "expected 'isolation_forest', 'zscore' or 'iqr'"
    )

transform(X)

Apply noise detection to new data.

Source code in engines/contentFilterEngine/miscellaneous_techniques/noise_handling.py
45
46
47
48
49
50
51
52
def transform(self, X):
    """Filter new data, returning only the rows judged to be inliers.

    Every method other than 'isolation_forest' simply re-runs
    ``fit_transform`` on ``X`` and returns its first element. The isolation
    forest path reuses the detector stored on ``self.outlier_detector`` and
    raises ``ValueError`` when that detector has not been set.
    """
    # Non-forest methods delegate to fit_transform on the data given here.
    if self.method != 'isolation_forest':
        cleaned = self.fit_transform(X)
        return cleaned[0]

    detector = self.outlier_detector
    if detector is None:
        raise ValueError("Fit the detector first using fit_transform()")

    # Rows labelled 1 by the detector are the ones kept.
    keep_mask = detector.predict(X) == 1
    return X[keep_mask]