Data Fountain "AI-Based Malware Family Classification": Competition Summary
Working on my own, my solution reached a loss of about 0.78; a senior teammate's method reached a loss of about 0.207.

The data provided for each malware sample is an asm file (the disassembly with the PE header removed) and the corresponding PE file. The samples are also unevenly distributed across families, so appropriate weights need to be applied when training the models.
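The weighting idea is just "rare families count more". A minimal sketch using sklearn's compute_sample_weight (the label values below are made up for illustration; the real labels come from the competition's label file):

import numpy as np
from sklearn.utils import compute_sample_weight

# Hypothetical labels for the 10 malware families (0-9).
y = np.array([0, 7, 7, 1, 9, 7, 2, 6, 7, 1])

# 'balanced' assigns each class the weight n_samples / (n_classes * class_count),
# so rare families contribute more to the training loss.
sample_weight = compute_sample_weight('balanced', y)
print(sample_weight.round(3))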
My own approach was as follows:
I extracted two groups of features. The first group is static: the file's windowed (interval) entropy, the file size, the count of each byte value 0-255 when the file is read as binary, the counts of common opcodes, and string statistics (number of strings, maximum string length, average string length). This part is a one-dimensional vector of length 2500.
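As a rough illustration of the byte-histogram plus windowed-entropy part of these static features, here is a minimal sketch; the helper name and window size are mine, and the full version is generate_byte_feature in the code further down:

import numpy as np
from collections import Counter
from scipy import stats

def byte_histogram_and_entropy(pe_path, window=1024):
    data = np.fromfile(pe_path, dtype=np.ubyte)        # raw bytes of the PE file
    counts = Counter(data)
    histogram = [counts[b] for b in range(256)]        # occurrences of each byte value 0-255
    entropies = []
    for start in range(0, len(data), window):          # entropy of each fixed-size window
        _, freq = np.unique(data[start:start + window], return_counts=True)
        entropies.append(stats.entropy(freq, base=2))  # scipy normalises the counts itself
    return histogram, entropies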
The second group comes from the file content: take the first 300,000 lines, collect the lines that start with six spaces, filter out those beginning with db or dd, and keep the first 10,000, the middle 10,000 and the last 10,000 of these lines. Split each line into common tokens, map the tokens to numbers, and end every line with 0, giving a feature of length 40,000.
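The encoding step looks roughly like the simplified sketch below; the vocabulary dict and helper name are illustrative, and the actual front/middle/tail truncation to a fixed length is handled in readf and generate_command_fature in the code further down:

import re

def encode_asm_lines(lines, vocab):
    # vocab maps frequent tokens to integer ids, built from the training set
    encoded = []
    for line in lines:
        if not line.startswith("      "):             # keep only the 6-space opcode lines
            continue
        tokens = re.findall(r'\b\w+\b', line.lower())
        if not tokens or tokens[0] in ("db", "dd"):   # skip data-definition lines
            continue
        encoded += [vocab[tok] for tok in tokens if tok in vocab]
        encoded.append(0)                             # 0 marks the end of a line
    return encoded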
Finally, LightGBM combined with 5-fold cross-validation is used to produce the predictions.
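In outline the validation loop looks like the sketch below. This is illustrative only and assumes numpy inputs where every family has at least five samples, so each fold sees all ten classes; my real code with hand-tuned class weights is lgb_with_kfold further down.

import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import compute_sample_weight
from sklearn.metrics import log_loss

def lgb_5fold_logloss(X, y):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    fold_losses = []
    for train_idx, valid_idx in skf.split(X, y):
        model = lgb.LGBMClassifier(objective='multiclass')
        model.fit(X[train_idx], y[train_idx],
                  sample_weight=compute_sample_weight('balanced', y[train_idx]))
        proba = model.predict_proba(X[valid_idx])
        fold_losses.append(log_loss(y[valid_idx], proba, labels=np.unique(y)))
    return float(np.mean(fold_losses))                # the competition metric is multi-class log loss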
Compared with my approach, the senior teammate's method has two improvements. First, packer analysis: the section names are extracted from the asm file. Second, in the 5-fold validation he combined several additional models (CatBoost, XGBoost, RandomForest, ExtraTrees) with LightGBM, whereas I used LightGBM alone.
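I did not implement the packer analysis myself, but extracting section names from the asm could look roughly like the sketch below, assuming IDA-style lines prefixed with `sectionname:address`; the regex and helper name are my own guess, not the senior teammate's code:

import re
from collections import Counter

SECTION_PREFIX = re.compile(r'^([A-Za-z_.][\w.]*):[0-9A-Fa-f]+')

def section_name_counts(asm_path, max_lines=300000):
    # Count how many lines belong to each named section; unusual section
    # names such as UPX0 suggest the sample is packed.
    counts = Counter()
    with open(asm_path, encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            if i >= max_lines:
                break
            match = SECTION_PREFIX.match(line)
            if match:
                counts[match.group(1)] += 1
    return counts

The stacked multi-model ensemble itself is implemented in train_five_model in the code below. The full code of my own pipeline follows.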
import re
import json
from math import log
from collections import *
from scipy import stats
import tensorflow as tf
import codecs
from tensorflow.keras import models, layers
from lightgbm import LGBMRegressor
import pandas as pd
import numpy as np
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, \
    GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_validate, StratifiedKFold
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
import pickle
import os
from sklearn.utils import compute_sample_weight
import csv
words = open("words-by-frequency.txt").read().split()
wordcost = dict((k, log((i + 1) * log(len(words)))) for i, k in enumerate(words))
maxword = max(len(x) for x in words)


def do_mymlp(train_x, train_y):
    # Plain MLP on the 2500-length static feature vector
    tf.keras.backend.clear_session()
    print("do mymlp")
    # inputs = layers.Input(shape=max_features)
    inputs = layers.Input(shape=2500)
    x = layers.Dense(1000, activation='tanh')(inputs)
    x = layers.Dense(500, activation='sigmoid')(x)
    # x = layers.Dropout(0.2)(x)
    x = layers.Dense(250, activation='sigmoid')(x)
    x = layers.Dense(100, activation='sigmoid')(x)
    x = layers.Dense(50, activation='sigmoid')(x)
    outputs = layers.Dense(10, activation='softmax')(x)
    model = models.Model(inputs=inputs, outputs=outputs)
    model.summary()
    import datetime
    # On Python 3, pathlib is recommended to normalise paths across operating systems
    from pathlib import Path
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    logdir = str(Path('./data/autograph/' + stamp))
    tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  # note: categorical_crossentropy would be the standard choice for a 10-way softmax
                  loss=tf.keras.losses.binary_crossentropy,
                  metrics=["accuracy", "Recall", "Precision", "AUC"])
    history = model.fit(train_x, train_y, epochs=30, validation_split=0.1,
                        callbacks=[tensorboard_callback], workers=4, shuffle=True)
    return model


def cnn_mlp(train_x1, train_x2, train_y):
    # Combine a CNN branch and an MLP branch:
    # x1 is the 40000-length content feature, x2 is the 2500-length file feature
    tf.keras.backend.clear_session()
    # train_x1 = np.reshape(train_x1, (-1, 200, 200))
    input1 = layers.Input(shape=(200, 200))
    input2 = layers.Input(shape=2500)
    x1 = layers.Conv1D(16, kernel_size=5, name="conv_1", activation="relu")(input1)
    x1 = layers.MaxPool1D(name="maxpool1")(x1)
    x1 = layers.Conv1D(128, kernel_size=2, name="conv_2", activation="relu")(x1)
    x1 = layers.MaxPool1D(name="maxpool2")(x1)
    x1 = layers.Flatten()(x1)
    x2 = layers.Dense(1200, activation='tanh')(input2)
    c = layers.concatenate([x1, x2], axis=1)
    c = layers.Dense(600, activation='sigmoid')(c)
    c = layers.Dense(300, activation='sigmoid')(c)
    c = layers.Dense(150, activation='sigmoid')(c)
    c = layers.Dense(60, activation='sigmoid')(c)
    outputs = layers.Dense(10, activation='softmax')(c)
    model = models.Model(inputs=[input1, input2], outputs=outputs)
    model.summary()
    import datetime
    # On Python 3, pathlib is recommended to normalise paths across operating systems
    from pathlib import Path
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    logdir = str(Path('./data/autograph/' + stamp))
    tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss=tf.keras.losses.categorical_crossentropy,
                  metrics=["accuracy", "Recall", "Precision", "AUC"])
    history = model.fit([train_x1, train_x2], train_y, epochs=30, validation_split=0.2,
                        callbacks=[tensorboard_callback], workers=4, shuffle=True)
    return model


def load_numpy_data(x_name, y_name):
    # Load saved numpy arrays
    return np.load(x_name), np.load(y_name)


def infer_spaces(s):
    # Split a string with no separators into words
    """Uses dynamic programming to infer the location of spaces in a string
    without spaces."""

    # Find the best match for the i first characters, assuming cost has
    # been built for the i-1 first characters.
    # Returns a pair (match_cost, match_length).
    def best_match(i):
        candidates = enumerate(reversed(cost[max(0, i - maxword):i]))
        return min((c + wordcost.get(s[i - k - 1:i], 9e999), k + 1) for k, c in candidates)

    # Build the cost array.
    cost = [0]
    for i in range(1, len(s) + 1):
        c, k = best_match(i)
        cost.append(c)

    # Backtrack to recover the minimal-cost string.
    out = []
    i = len(s)
    while i > 0:
        c, k = best_match(i)
        assert c == cost[i]
        out.append(s[i - k:i])
        i -= k
    return " ".join(reversed(out))


def write_dict2json(dictdata, save_path="test_dict.json"):
    # Save a dict to a json file
    try:
        json_str = json.dumps(dictdata)
        if dictdata == {}:
            return
        with open(save_path, 'w') as json_file:
            json_file.write(json_str)
    except Exception as e:
        print("write_dict2json Error")


def read_json2dict(save_path="test_dict.json"):
    # Load a dict from a json file
    try:
        with open(save_path, encoding="utf-8") as file:
            one_dict = json.load(file)
        return one_dict
    except Exception as e:
        print("read_json2dict Error", e)
        return {}


def readf(filename, dict_collect, type):
    # Read an asm file and generate its features
    token_pattern = r'\b\w+\b'
    total_line_num = 0
    op_line_num = 0
    string_pattern1 = re.compile(r'["](.*?)["]', re.S)
    string_pattern2 = re.compile(r'[\'](.*?)[\']', re.S)
    str_collect = []
    cmdfeature = []
    file_md5 = filename.split('\\')[-1][:-4]
    pe_file_path = "D:\\pythonWorkspace\\data_fountaion\\" + type + "\\" + type + "\\pe" + "\\" + file_md5
    filesize = os.path.getsize(filename) // 1000
    print(filesize)
    for line in open(filename, encoding='utf-8', errors='ignore'):
        total_line_num = total_line_num + 1
        if total_line_num % 10000 == 0:
            print(total_line_num)
        if total_line_num >= 1000000:
            break
        if line.startswith("      "):  # opcode lines start with six spaces
            op_line_num = op_line_num + 1
            line = line.lower()
            if '"' in line:
                string_result = re.findall(string_pattern1, line)
                if string_result:
                    str_collect = str_collect + string_result
            else:
                if "'" in line:
                    string_result = re.findall(string_pattern2, line)
                    if string_result:
                        str_collect = str_collect + string_result
            cmdfeature = cmdfeature + generate_command_fature(line, dict_collect)
    cmdfeature_length = len(cmdfeature)
    if cmdfeature_length > 250000:
        # Keep the head, middle and tail of the sequence
        fixed_feature = cmdfeature[:62500] + \
                        cmdfeature[cmdfeature_length // 2 - 62500:cmdfeature_length // 2 + 62500] + \
                        cmdfeature[-62500:]
    else:
        fixed_feature = cmdfeature + [0] * (250000 - cmdfeature_length)
    str_feature = np.array(generate_str_feature(str_collect))
    byte_feature = generate_byte_feature(pe_file_path)
    count_feature = np.array(turn_op_count2feature(dict_collect))
    file_feature = np.array([filesize, op_line_num])
    static_feature = np.hstack((file_feature, count_feature, str_feature, byte_feature))
    content_feature = np.array(fixed_feature)
    print(static_feature.shape)
    return static_feature, content_feature


def check_asm(filename, word_dict={}):
    # Check whether the asm file contains text that cannot be decoded
    file = open(filename)
    flag = 0
    while True:
        try:
            line = file.readline()
            print(line)
            if not line:  # stop at end of file
                break
        except Exception as e:
            line = file.readlines()


def get_data_path(dir="D:\\pythonWorkspace\\data_fountaion\\train\\train\\asm"):
    # Return a list of the full paths of all asm files under the given directory
    g = os.walk(dir)
    result_ls = []
    for path, d, filelist in g:
        for filename in filelist:
            if filename.endswith('.asm'):
                final_path = os.path.join(path, filename)
                result_ls.append(final_path)
    return result_ls


def dict_filter(dictdata={}):
    # Remove words that appear only once from the dict
    for each_key in list(dictdata.keys()):
        if dictdata.get(each_key) == 1:
            dictdata.pop(each_key, None)


def generate_dict_from_ls(lsdata=[]):
    # Build a word -> number mapping dict from a list
    count = 1
    result_dict = {}
    for each in lsdata:
        result_dict.update({each: count})
        count = count + 1
    return result_dict


def generate_dict_from_count(file_name='wordcount.json'):
    # Build the mapping vocabulary from the word-count file
    dictdata = read_json2dict(file_name)
    final_word_ls = []
    for each_word in dictdata.keys():
        if dictdata.get(each_word) >= 500 and len(each_word) > 2:
            final_word_ls.append(each_word)
    return final_word_ls


def mix_list(lsa=[], lsb=[]):
    # Merge two lists, dropping duplicate elements
    final_ls = lsa
    for each in lsb:
        if each not in final_ls:
            final_ls.append(each)
    return final_ls


def Entropy(labels, base=2):
    # Probability distribution of the values
    probs = pd.Series(labels).value_counts() / len(labels)
    # Entropy with the given base
    en = stats.entropy(probs, base=base)
    return en


def generate_byte_feature(file_name):
    # Generate byte-count and windowed-entropy features of length feature_length
    feature_length = 50000
    with open(file_name, mode='rb') as f:
        file_rb = np.fromfile(f, dtype=np.ubyte)
    byte_laymap = Counter(file_rb)
    file_length = len(file_rb)
    byte_layout = []  # counts of each byte value 0-255
    for byte in range(0, 256):
        byte_layout.append(byte_laymap[byte])
    current_loc = 0  # sliding window for the entropy
    step = 1024  # bytes the window moves each step
    entro_list = []
    while current_loc < file_length:
        if current_loc + step > file_length:
            end = file_length
        else:
            end = current_loc + step
        entro_list.append(Entropy(file_rb[current_loc:end]))
        current_loc = current_loc + step
    final_result = byte_layout + entro_list
    result_length = len(final_result)
    print(result_length)
    if len(final_result) < feature_length:
        final_result = np.array(final_result)
        final_result = list(np.pad(final_result, (0, feature_length - len(final_result)), 'constant',
                                   constant_values=(0, 0)))
        return final_result[0:feature_length]
    else:
        # Keep the head, middle and tail so the result still has length feature_length
        return final_result[0:feature_length // 4] + \
               final_result[result_length // 2 - feature_length // 4:result_length // 2 + feature_length // 4] + \
               final_result[-(feature_length // 4):]


def generate_str_feature(str_ls):
    # String features from a list of strings: per-character counts,
    # plus the number of strings, the average length and the maximum length
    charsum = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    specialsum = '!?@#$%~^&*()_+=[];:.'
    charactersum = charsum + specialsum
    chara_count = []
    max_len = -1
    str_num = max(len(str_ls), 1)
    total_len = 0
    fullcontent = ""
    for each_str in str_ls:
        current_length = len(each_str)
        fullcontent = fullcontent + each_str
        total_len = total_len + current_length
        if current_length > max_len:
            max_len = current_length
    avelen = total_len // str_num
    for each in list(charactersum):
        current_count = fullcontent.count(each)
        chara_count.append(current_count)
    # The returned feature has length 85
    return [str_num, avelen, max_len] + chara_count


def generate_command_fature(cmdline=" db 0E2h ; ? db 0FCh ; ? db 3Fh ; ?", dict_collect={}):
    token_pattern = r'\b\w+\b'
    # Two dicts: one counts how often common opcodes appear,
    # the other maps tokens to numbers
    op_count_dict = dict_collect.get('op_count_dict')
    feature_dict = dict_collect.get('feature_dict')
    word_ls = re.findall(token_pattern, cmdline)
    cmdlinefeature = []
    if word_ls == []:
        return []
    if word_ls[0] == 'dd' or word_ls[0] == 'db':
        # These two appear far too often, so they are only counted, not vectorised
        for each in word_ls:
            if each in op_count_dict.keys():
                op_count_dict.update({each: 1 + op_count_dict.get(each)})
        return []
    else:
        for each in word_ls:
            if each in feature_dict.keys():
                cmdlinefeature.append(feature_dict.get(each))
            if each in op_count_dict.keys():
                op_count_dict.update({each: 1 + op_count_dict.get(each)})
        cmdlinefeature.append(0)
        return cmdlinefeature


def turn_op_count2feature(dict_collect={}):
    # Turn the opcode-count dict into a feature vector
    feature_ls = []
    op_count_dict = dict_collect.get('op_count_dict')
    for each_word in op_count_dict.keys():
        feature_ls.append(op_count_dict.get(each_word))
    # Currently the length is 68
    return feature_ls


def generate_op_count_dict():
    # Build a dict that counts occurrences of common opcodes / tokens
    op_list = ["add", "ax", "arg", "al", "align", "bx", "byte", "call", "cmp", "dword", "ds", "dq", "dw", "db", "eax",
               "ebx", "ebp", "ecx", "edx", "edi", "esi", "esp", "extrn", "fnstenv", "fnstcw", "fst", "fxc", "fld",
               "fc", "fs",
               "jmp", "jb", "jnz", "ja", "jz", "jl", "inc", "lea", "loc", "mov", "near", "not", "offset", "push", "pop",
               "ptr",
               "psr", "psu", "retn", "rax", "rbx", "rsi", "rsp", "rdi", "rbp", "rcx", "rdx", "rva", "sleep", "stmxcsr",
               "second", "sub", "short",
               "test", "unk", "var", "wait", "xor"]
    finaldict = {}
    for each_word in op_list:
        finaldict.update({each_word: 0})
    return finaldict


def get_one_hot_from_num(labelint):
    # Given a digit 0-9, return its one-hot vector
    onehot_label = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    onehot_label[int(labelint)] = 1
    return onehot_label


def get_nearlyonehot_from_num(labelint):
    # Given a digit 0-9, return a softened one-hot vector
    # (non-target classes keep a small non-zero value)
    onehot_label = [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.1, 0.1, 0.1]
    onehot_label[int(labelint)] = 1
    return onehot_label


def do_mycnn(train_x, train_y):
    tf.keras.backend.clear_session()
    print("do mycnn")
    inputs = layers.Input(shape=(200, 200))
    # x = Self_Attention(128)(inputs)
    x = layers.Conv1D(16, kernel_size=5, name="conv_1", activation="relu")(inputs)
    x = layers.MaxPool1D(name="maxpool1")(x)
    x = layers.Conv1D(128, kernel_size=2, name="conv_2", activation="relu")(x)
    x = layers.MaxPool1D(name="maxpool2")(x)
    x = layers.Flatten()(x)
    outputs = layers.Dense(10, activation='softmax')(x)
    model = models.Model(inputs=inputs, outputs=outputs)
    model.summary()
    import datetime
    # On Python 3, pathlib is recommended to normalise paths across operating systems
    from pathlib import Path
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    logdir = str(Path('./data/autograph/' + stamp))
    tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss=tf.keras.losses.categorical_crossentropy,
                  metrics=["accuracy", "Recall", "Precision", "AUC"])
    history = model.fit(train_x, train_y, epochs=20, validation_split=0.2,
                        callbacks=[tensorboard_callback], workers=4, shuffle=True)
    return model


def generate_label_np(type='train'):
    # Generate the label array for the files (one-hot version is commented out);
    # the order follows the files in the directory
    data_dir = "D:\\pythonWorkspace\\data_fountaion\\" + type + "\\" + type + "\\asm"
    all_file_path = get_data_path(data_dir)
    label_dict = read_json2dict('label_dict.json')
    label_ls = []
    for each_file_path in all_file_path:
        onehot_label = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        file_md5 = each_file_path.split('\\')[-1][:-4]
        file_type = label_dict.get(file_md5)
        if file_type is not None:
            # print(each_file_path, file_md5, file_type)
            onehot_label[int(file_type)] = 1.0
            # label_ls.append(onehot_label)
            label_ls.append(file_type)
        else:
            print(file_md5)
            print("None")
    print(label_ls)
    label_array = np.array(label_ls)
    print(label_array.shape)
    np.save(type + 'numlabel', label_array)


def check_result(result_np, real_label, model):
    # Post-process the model predictions and count the correct ones
    count = 0
    correct_count = 0
    real_label = list(real_label)
    for each_result in result_np:
        each_result = list(each_result)
        max_loc = each_result.index(max(each_result))
        guess_label = get_one_hot_from_num(max_loc)
        if get_one_hot_from_num(max_loc) == list(real_label[count]):
            correct_count = correct_count + 1
        else:
            print(max_loc, "----", real_label[count])
        count = count + 1
    print("correct", correct_count)
    if correct_count >= 5800:
        # Save models with high accuracy
        model.save('tf_model' + str(correct_count), save_format="tf")


def generate_answer_data(predict_result):
    # Build the rows for the submission file
    type = 'test'
    label_dict = read_json2dict('label_dict.json')
    predict_result = list(predict_result)
    count = 0
    answer_count = 0
    answer_data = []
    testfile = get_data_path("D:\\pythonWorkspace\\data_fountaion\\" + type + "\\" + type + "\\asm")
    for each_file in testfile:
        filesize = os.path.getsize(each_file) // 1000
        file_md5 = each_file.split('\\')[-1][:-4]
        print(file_md5)
        max_loc = list(predict_result[count]).index(max(list(predict_result[count])))
        label_one_hot = tuple([file_md5] + list(predict_result[count]))
        if file_md5 in label_dict.keys():
            # If the true answer is already known, copy it directly
            print(file_md5)
            answer_count = answer_count + 1
            true_answer = label_dict.get(file_md5)
            print("answer:", true_answer, max_loc, "\n")
            max_loc = true_answer
            label_one_hot = tuple([file_md5] + get_one_hot_from_num(max_loc))
        answer_data.append(label_one_hot)
        count = count + 1
    print(answer_count)
    return answer_data


def turn_prob_to_label(prob_data):
    # Turn predicted probabilities into one-dimensional digit labels
    prob_data = list(prob_data)
    result = []
    for each_data in prob_data:
        max_loc = list(each_data).index(max(list(each_data)))
        result.append(str(max_loc))
    return result


def lightgbm(x, y):
    class_weight = {'0': 5, '1': 0.783, '2': 29.23, '3': 20.242, '4': 5, '5': 5,
                    '6': 0.753, '7': 0.4347, '8': 1.01, '9': 0.502}
    model = LGBMRegressor(max_depth=25, objective='multiclass', num_class=10, class_weight=class_weight)
    x_train, x_test_valid, y_train, y_test_valid = train_test_split(x, y, test_size=0.3, random_state=1)
    model.fit(x_train, y_train, early_stopping_rounds=10,
              eval_set=[(x_train, y_train), (x_test_valid, y_test_valid)],
              eval_metric="multi_error", verbose=True)
    # model.fit(x, y, early_stopping_rounds=20, eval_set=[(x, y)],
    #           eval_metric="multi_logloss", verbose=True)
    # make prediction
    preds = model.predict(x)
    test_result = np.array(turn_prob_to_label(preds))
    print(test_result[0:10])
    print(y[0:10])
    test_accuracy = accuracy_score(test_result, y)
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    pickle.dump(model, open("lgboostmodel_250.pickle.dat", "wb"))
    # loaded_model = pickle.load(open("pima.pickle.dat", "rb"))


def write_csv(answer_data=[(0, 0)]):
    # Write the submission csv file
    data_head = [("filename", "family_0", "family_1", "family_2", "family_3", "family_4", "family_5", "family_6",
                  "family_7", "family_8", "family_9"), ]
    data = data_head + answer_data
    f = codecs.open('submit928.csv', 'w', 'gbk')
    writer = csv.writer(f)
    for i in data:
        writer.writerow(i)
    f.close()


def balance_weight(ydata):
    # Map the label array to per-sample weights
    class_weight = {0: 1.364, 1: 0.783, 2: 29.23, 3: 2.242, 4: 1.82, 5: 3.23,
                    6: 0.753, 7: 0.4347, 8: 1.01, 9: 0.502}
    # return one weight per sample
    return np.array([class_weight[int(label)] for label in ydata])


def xgboost(x, y):
    class_weight = {'0': 1.364, '1': 0.783, '2': 29.23, '3': 2.242, '4': 1.82, '5': 3.23,
                    '6': 0.753, '7': 0.4347, '8': 1.01, '9': 0.502}
    model = XGBClassifier(max_depth=30, objective='multi:softprob', num_class=10)
    x_train, x_test_valid, y_train, y_test_valid = train_test_split(x, y, test_size=0.002, random_state=1)
    model.fit(x_train, y_train, early_stopping_rounds=10,
              eval_set=[(x_train, y_train), (x_test_valid, y_test_valid)],
              eval_metric="mlogloss", verbose=True,
              # the weight vector must have one entry per training sample
              sample_weight=compute_sample_weight('balanced', y_train))
    # make prediction
    preds = model.predict(x_test_valid)
    print(preds[0:10])
    prob_pre = model.predict_proba(x_test_valid)
    print(prob_pre[0:10])
    test_accuracy = accuracy_score(y_test_valid, preds)
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    if test_accuracy > 0.999:
        pickle.dump(model, open("xgboostmodel2.pickle.dat", "wb"))
    # loaded_model = pickle.load(open("pima.pickle.dat", "rb"))


def lgb_with_kfold(x, y):
    # LightGBM combined with k-fold
    class_weight = {'0': 1.364, '1': 0.783, '2': 29.23, '3': 12.242, '4': 1.82, '5': 3.23,
                    '6': 0.753, '7': 0.4347, '8': 1.01, '9': 0.502}
    skf = StratifiedKFold(n_splits=5)
    model = LGBMRegressor(max_depth=30, objective='multiclass', num_class=10, class_weight=class_weight)
    count = 1
    for train_index, test_index in skf.split(x, y):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(x_train, y_train, early_stopping_rounds=10,
                  eval_set=[(x_train, y_train), (x_test, y_test)],
                  eval_metric="multi_logloss", verbose=True)
        prob_pre = model.predict(x)
        print(prob_pre[0:10])
        test_accuracy = accuracy_score(turn_prob_to_label(prob_pre), y)
        print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))


def xgb_with_kfold(x, y):
    # XGBoost combined with k-fold
    skf = StratifiedKFold(n_splits=5)
    model = XGBClassifier(max_depth=25, objective='multi:softprob', num_class=10)
    for train_index, test_index in skf.split(x, y):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(x_train, y_train, early_stopping_rounds=10,
                  eval_set=[(x_train, y_train), (x_test, y_test)],
                  eval_metric="mlogloss", verbose=True)
        preds = model.predict(x)
        print(preds[0:10])
        prob_pre = model.predict_proba(x)
        print(prob_pre[0:10])
        test_accuracy = accuracy_score(preds, y)
        print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))


def turn_feature_2_length(filename):
    # Trim trailing zeros, then fix every feature row to max_featur_length
    npfeature = list(np.load(filename))
    final_feature = []
    linecount = 0
    for feature_row in npfeature:
        linecount = linecount + 1
        print(linecount)
        feature_row = list(feature_row)
        count = len(feature_row) - 1
        while feature_row[count] == 0:
            count = count - 1
        feature_row = feature_row[:count + 1]
        max_featur_length = 100000
        current_length = len(feature_row)
        if current_length > max_featur_length:
            feature_row = feature_row[:max_featur_length // 4] + \
                          feature_row[max_featur_length // 2 - max_featur_length // 4:
                                      max_featur_length // 2 + max_featur_length // 4] + \
                          feature_row[-(max_featur_length // 4):]
        if current_length < max_featur_length:
            feature_row = feature_row + [0] * (max_featur_length - current_length)
        final_feature.append(feature_row)
    final_feature = np.array(final_feature)
    np.save(str(max_featur_length) + filename, final_feature)


def get_onehot(labels):
    n_samples = len(labels)
    n_classes = 10
    onehot_labels = np.zeros((n_samples, n_classes))
    onehot_labels[np.arange(n_samples), labels] = 1
    return onehot_labels


def calc_log_loss(y_true, y_pred):
    y_true = list(y_true)
    y_true = get_onehot(y_true)
    return log_loss(y_true, y_pred)


def train_five_model(train_X, train_Y, test_X):
    # Stack five models (CatBoost, XGBoost, LightGBM, RandomForest, ExtraTrees)
    gbm = lgb.LGBMClassifier(n_jobs=-1, objective='multiclass', metric='multi_logloss')
    xgbc = XGBClassifier(n_jobs=-1, objective='multi:softprob')
    cat = CatBoostClassifier(verbose=0, loss_function='MultiClass')
    rf = RandomForestClassifier(n_jobs=-1)
    ext = ExtraTreesClassifier(n_jobs=-1)
    stack_estimators = [('cat', cat), ('xgbc', xgbc), ('gbm', gbm), ('rf', rf), ('ext', ext)]
    sta = StackingClassifier(estimators=stack_estimators, final_estimator=SVC(probability=True), cv=5,
                             n_jobs=-1, verbose=1)
    skf = StratifiedKFold(n_splits=5)
    total_res = []
    total_log = []
    for train_index, test_index in skf.split(train_X, train_Y):
        X_train, X_test = train_X[train_index], train_X[test_index]
        y_train, y_test = train_Y[train_index], train_Y[test_index]
        sample_weight = compute_sample_weight('balanced', y_train)
        sta.fit(X_train, y_train, sample_weight=sample_weight)
        res = sta.predict_proba(X_test)
        fold_log_loss = calc_log_loss(y_test, res)
        print("log_loss:%f" % fold_log_loss)
        total_log.append(fold_log_loss)
        res = sta.predict_proba(test_X)
        total_res.append(res)
    print(np.mean(total_log))
    avg_proba = []
    # print(total_res)
    for line in range(total_res[0].shape[0]):
        temp_proba = np.zeros((5, 10))
        for i in range(len(total_res)):
            temp = total_res[i][line]
            temp_proba[i] = temp
        # print(temp_proba)
        # print(temp_proba.shape)
        x = np.mean(temp_proba, axis=0)
        # print(x)
        avg_proba.append(x)
    write_csv(generate_answer_data(avg_proba))
    return avg_proba


def generate_file_feature(type, dict_collect={}):
    # Generate and save the features, in two parts: a static feature formed by
    # concatenating a small number of features, and a long feature built from the opcodes
    data_dir = "D:\\pythonWorkspace\\data_fountaion\\" + type + "\\" + type + "\\asm"
    all_file_path = get_data_path(data_dir)
    if os.path.isfile("static_feature_2" + type + ".npy"):
        static_feature_collect = np.load("static_feature_2" + type + ".npy")
        content_feature_collect = np.load("content_feature_2" + type + ".npy")
        count = static_feature_collect.shape[0]
        all_file_path = all_file_path[count:]
        print("continue:", count)
    else:
        count = 0
    for each_file_path in all_file_path:
        count = count + 1
        print(each_file_path, count)
        static_feature, content_feature = readf(each_file_path, dict_collect, type)
        if count == 1:
            static_feature_collect = static_feature
            content_feature_collect = content_feature
        else:
            static_feature_collect = np.vstack((static_feature_collect, static_feature))
            content_feature_collect = np.vstack((content_feature_collect, content_feature))
        # Reset the count dict after every file
        dict_collect.update({'op_count_dict': generate_op_count_dict()})
        print(static_feature_collect.shape)
        print(content_feature_collect.shape)
        # Save after every file so an interrupted run can be resumed
        np.save("static_feature_2" + type, static_feature_collect)
        np.save("content_feature_2" + type, content_feature_collect)


op_list = ["add", "ax", "arg", "al", "align", "bx", "byte", "call", "cmp", "dword", "ds", "dq", "dw", "db", "eax",
           "ebx", "ebp", "ecx", "edx", "edi", "esi", "esp", "extrn", "fnstenv", "fnstcw", "fst", "fxc", "fld",
           "fc", "fs",
           "jmp", "jb", "jnz", "ja", "jz", "jl", "inc", "lea", "loc", "mov", "near", "not", "offset", "push", "pop",
           "ptr",
           "psr", "psu", "retn", "rax", "rbx", "rsi", "rsp", "rdi", "rbp", "rcx", "rdx", "rva", "sleep", "stmxcsr",
           "second", "sub", "short",
           "test", "unk", "var", "wait", "xor"]
