xtract-sampler/predict.py at master · xtracthub/xtract-sampler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import numpy as np
import json
import os
from headbytes import HeadBytes
from extpredict import FileReader, SystemReader
from randbytes import RandBytes
from randhead import RandHead


def predict_single_file(filename, trained_classifier, feature, head_bytes=512, rand_bytes=512):
    """Predict the class label of a single file.

    Args:
        filename (str): Name of the file to predict the type of; handed to
            ``FileReader``.
        trained_classifier (sklearn model): Trained model; only its
            ``predict`` method is used.
        feature (str): Type of feature that ``trained_classifier`` was
            trained on. One of ``"head"``, ``"randhead"`` or ``"rand"``.
        head_bytes (int): Forwarded as ``head_size`` to ``HeadBytes`` and
            ``RandHead``.
        rand_bytes (int): Forwarded as ``rand_size`` to ``RandHead`` and as
            ``number_bytes`` to ``RandBytes``.

    Returns:
        The first key of ``CLASS_TABLE.json`` whose value equals the
        predicted class index.

    Raises:
        ValueError: If ``feature`` is not one of the supported feature sets,
            or if the predicted index has no entry in ``CLASS_TABLE.json``.
    """
    # Validate ``feature`` before any file I/O so a bad argument fails fast.
    # ValueError is a subclass of Exception, so callers that catch the
    # generic Exception keep working.
    if feature == "head":
        features = HeadBytes(head_size=head_bytes)
    elif feature == "randhead":
        features = RandHead(head_size=head_bytes, rand_size=rand_bytes)
    elif feature == "rand":
        features = RandBytes(number_bytes=rand_bytes)
    else:
        raise ValueError("Not a valid feature set. ")

    # Relative path: resolved against the current working directory.
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('CLASS_TABLE.json', 'r') as f:
        label_map = json.load(f)

    reader = FileReader(feature_maker=features, filename=filename)
    reader.run()

    # The third item of ``reader.data`` is used as the feature sequence.
    # NOTE(review): presumably the first two items are directory and file
    # name, as in predict_directory -- confirm against FileReader.
    raw_features = list(reader.data)[2]

    # Each feature element is converted to one big-endian integer; the
    # classifier receives a single-sample batch.
    x = [np.array([int.from_bytes(c, byteorder="big") for c in raw_features])]

    predicted_index = int(trained_classifier.predict(x)[0])

    # Reverse lookup (index -> label name); the first matching key wins.
    for label, index in label_map.items():
        if index == predicted_index:
            return label
    raise ValueError(
        "{} is not a class index in CLASS_TABLE.json".format(predicted_index)
    )


def predict_directory(dir_name, trained_classifier, feature, head_bytes=512, rand_bytes=512):
    """Predict the class label of every file found by ``SystemReader``.

    Args:
        dir_name (str): Directory handed to ``SystemReader`` as ``top_dir``.
        trained_classifier (sklearn model): Trained model; only its
            ``predict`` method is used, once per file.
        feature (str): Type of feature that ``trained_classifier`` was
            trained on. One of ``"head"``, ``"randhead"`` or ``"rand"``.
        head_bytes (int): Forwarded as ``head_size`` to ``HeadBytes`` and
            ``RandHead``.
        rand_bytes (int): Forwarded as ``rand_size`` to ``RandHead`` and as
            ``number_bytes`` to ``RandBytes``.

    Returns:
        dict: Maps ``os.path.join(entry[0], entry[1])`` for each entry of
        ``reader.data`` to the predicted label name from
        ``CLASS_TABLE.json``.

    Raises:
        ValueError: If ``feature`` is not one of the supported feature sets,
            or if a predicted index has no entry in ``CLASS_TABLE.json``.
    """
    # Validate ``feature`` before any file I/O so a bad argument fails fast.
    # ValueError is a subclass of Exception, so callers that catch the
    # generic Exception keep working.
    if feature == "head":
        features = HeadBytes(head_size=head_bytes)
    elif feature == "randhead":
        features = RandHead(head_size=head_bytes,
                            rand_size=rand_bytes)
    elif feature == "rand":
        features = RandBytes(number_bytes=rand_bytes)
    else:
        raise ValueError("Not a valid feature set. ")

    # Relative path: resolved against the current working directory.
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('CLASS_TABLE.json', 'r') as f:
        label_map = json.load(f)

    # Build the index -> label lookup once instead of once per file.
    # setdefault keeps the first label for a duplicated index, matching a
    # first-match scan over the table.
    index_to_label = {}
    for label, index in label_map.items():
        index_to_label.setdefault(index, label)

    reader = SystemReader(feature_maker=features, top_dir=dir_name)
    reader.run()

    file_predictions = {}
    for file_data in reader.data:
        # Items 0 and 1 are joined into the result key; item 2 is used as
        # the feature sequence.
        # NOTE(review): presumably (directory, file name, features) --
        # confirm against SystemReader.
        raw_features = file_data[2]

        # Each feature element is converted to one big-endian integer; the
        # classifier receives a single-sample batch.
        x = [np.array([int.from_bytes(c, byteorder="big") for c in raw_features])]

        predicted_index = int(trained_classifier.predict(x)[0])
        try:
            label = index_to_label[predicted_index]
        except KeyError:
            raise ValueError(
                "{} is not a class index in CLASS_TABLE.json".format(predicted_index)
            ) from None
        file_predictions[os.path.join(file_data[0], file_data[1])] = label

    return file_predictions