MalwareClassify/model.py at main · Spajed/MalwareClassify · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import tensorflow as tf
import matplotlib.pyplot as plt


class MalwareDetectionNN:
    """
    Fully-connected binary classifier built on ``tf.keras.Sequential``.

    The network is a stack of Dense/BatchNormalization/Dropout groups followed by a single sigmoid unit, so the
    model output is a probability in [0, 1] (not a logit).
    """

    def __init__(self, num_attributes):
        """
        Constructor for Malware Detection Neural Network model

        :param num_attributes: Number of input features per sample (size of the input vector)
        """
        # Populated by `train`; holds the Keras History object returned by `fit`
        self.__history = None

        self.__model = tf.keras.Sequential()
        self.__model.add(tf.keras.Input(shape=(num_attributes, )))

        # Three identical Dense -> BatchNormalization -> Dropout groups of decreasing width
        for units in (1800, 1200, 750):
            self.__model.add(tf.keras.layers.Dense(units, activation='relu'))
            self.__model.add(tf.keras.layers.BatchNormalization())
            self.__model.add(tf.keras.layers.Dropout(0.2))
        self.__model.add(tf.keras.layers.Dense(300, activation='relu'))

        # last layer uses the sigmoid activation function, so the model emits probabilities; the loss function is
        # therefore created with `from_logits=False` in `__compile`
        self.__model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    def summary(self):
        """
        Output overview of model structure and parameters

        :return:
        """
        self.__model.summary()

    def __compile(self, learning_rate: float = 0.001, optimizer: tf.keras.optimizers = tf.keras.optimizers.Adam,
                  loss_function: tf.keras.losses = tf.keras.losses.BinaryCrossentropy):
        """
        Private method for compiling the model to be used within the training and testing methods of the
        MalwareDetection model instances

        :param learning_rate: Learning rate specified for the initialization of the optimizer
        :param optimizer: Optimizer class; it is instantiated here with `learning_rate` as its only argument
        :param loss_function: Loss class; it is instantiated here and must accept a `from_logits` keyword
        :return:
        """
        # The output layer already applies a sigmoid, so the loss must treat predictions as probabilities.
        # Passing from_logits=True here would make the loss interpret probabilities as raw logits.
        self.__model.compile(optimizer=optimizer(learning_rate),
                             loss=loss_function(from_logits=False),
                             metrics=['accuracy'])

    def train(self, train_dataset: tf.data.Dataset, validation_dataset: tf.data.Dataset = None, epochs: int = 25,
              learning_rate: float = 0.001, optimizer: tf.keras.optimizers = tf.keras.optimizers.Adam,
              loss_function: tf.keras.losses = tf.keras.losses.BinaryCrossentropy):
        """
        Training function for Malware Detection Neural Network model

        Compiles the model, fits it while saving a checkpoint file per epoch, then plots the accuracy history.

        :param train_dataset: Dataset used to train the malware detection model
        :param validation_dataset: Dataset for validation of the malware prediction model accuracy during training;
        when None, no validation is run
        :param epochs: number of epochs for training the detection model
        :param learning_rate: Learning rate specified for the initialization of the optimizer
        :param optimizer: Type of optimizer to be used to best suite the model training
        :param loss_function: Type of loss function to best represent the desired output prediction type
        :return:
        """
        callbacks = [tf.keras.callbacks.ModelCheckpoint("Checkpoints/Save_at_{epoch}.h5"), ]

        self.__compile(learning_rate, optimizer, loss_function)
        # `validation_data=None` is the Keras default, so a single call covers both the with- and
        # without-validation cases
        self.__history = self.__model.fit(train_dataset,
                                          validation_data=validation_dataset,
                                          epochs=epochs,
                                          callbacks=callbacks)

        self.__plot_training_performance()

    def __plot_training_performance(self):
        """
        Private function to be run after training in order to visualize the accuracy behaviour of the training on the
        train and validation subsets of training dataset

        The validation curve is drawn only when the training history contains a 'val_accuracy' entry.

        :return:
        """
        metrics = self.__history.history
        plt.plot(metrics['accuracy'], label='training accuracy')
        # 'val_accuracy' is only recorded when validation data was supplied to `fit`
        if 'val_accuracy' in metrics:
            plt.plot(metrics['val_accuracy'], label='validation accuracy')
        plt.xlabel("Epoch")
        plt.ylabel('Accuracy')
        plt.ylim([0.0, 1.0])
        plt.legend(loc='lower right')
        plt.show()

    def load_checkpoint(self, checkpoint_file_path, learning_rate: float = 0.007,
                        optimizer: tf.keras.optimizers = tf.keras.optimizers.Adam,
                        loss_function: tf.keras.losses = tf.keras.losses.BinaryCrossentropy):
        """
        Load model weights from saved '.h5' model checkpoint file path and compile the model

        :param checkpoint_file_path: Path of the checkpoint file whose weights are loaded into the model
        :param learning_rate: Learning rate specified for the initialization of the optimizer
        :param optimizer: Type of optimizer the model is compiled with after the weights are loaded
        :param loss_function: Type of loss function the model is compiled with after the weights are loaded
        :return:
        """
        self.__model.load_weights(checkpoint_file_path)
        self.__compile(learning_rate, optimizer, loss_function)

    def test(self, test_dataset: tf.data.Dataset):
        """
        Testing function for Malware Detection Neural Network model

        :param test_dataset: data set used to evaluate the performance of the models generalization on data samples not
        seen during training
        :return: the result of `evaluate` on the compiled model (loss and metric values)
        """
        return self.__model.evaluate(test_dataset, verbose=1)