CatBoost 是一种基于梯度提升决策树(Gradient Boosting Decision Tree,GBDT)的机器学习框架。它是由 Yandex 开发的开源工具,在许多数据科学任务中都表现出色。

CatBoost 的名称源自其最主要的特性之一:能够自动处理分类特征(Categorical Features),即 Category + Boosting。传统的梯度提升决策树框架在处理分类特征时,需要先将其转换为数值型特征(例如独热编码);而 CatBoost 能够直接使用原始的分类特征,无需额外的预处理步骤。

CatBoost 采用了一系列技术来提高模型的泛化能力和准确性,包括:

- 对称二叉树结构:同一层的所有节点使用相同的分裂条件,树结构规整,有助于抑制过拟合并加快预测速度。
- 梯度优化:采用特定的目标函数和优化算法(如排序提升 Ordered Boosting,用于降低梯度估计的偏差),能够更有效地进行模型训练和参数优化。
- 特征组合的自动处理:CatBoost 可以自动识别和利用特征(尤其是分类特征)之间的组合关系,从而提高模型的预测能力。
- 目标函数的自定义支持:CatBoost 允许用户自定义目标函数,以满足特定的任务需求。

此外,CatBoost 还提供了一些其他的功能和优势,如处理缺失值、支持 GPU 加速;其默认超参数通常即可取得较好的效果,并内置了网格搜索等调参方法,减少了手动调参的工作量。

总之,CatBoost 是一种基于梯度提升决策树的机器学习框架,具有自动处理分类特征和其他优化技术的特点,可以用于分类和回归任务。它是一个强大而灵活的工具,在许多实际应用中都表现出良好的性能。
1 将日志数据提取到 data.csv
# -*- coding: utf-8 -*-
"""Extract feature rows from an access/error log into a CSV file.

Created on Tue Oct 17 14:29:10 2023

@author: Administrator

Every log line of the form ``<date> <time> [error] <pid>#<tid>: k00=..&k01=..&``
becomes one CSV row made of:

1. the number of seconds elapsed since the previous matching line
   (``0.0`` for the first one),
2. the ``kNN`` values in their order of appearance (empty values are
   written as ``"0"``),
3. a trailing ``"0"`` placeholder for the ``target`` column.
"""
import csv
import os
import re
from datetime import datetime

LOG_PATH = "as_access.log"
CSV_PATH = "data.csv"
# Only the first MAX_LINES lines of the log are processed.
MAX_LINES = 100
# Column names: "1" .. "34" for the features, then the label column.
HEADER = [str(i) for i in range(1, 35)] + ["target"]
# Placeholder label written for every extracted row.
DEFAULT_TARGET = "0"
# Value written in place of an empty kNN field.
EMPTY_VALUE = "0"
TIME_FORMAT = '%Y/%m/%d %H:%M:%S'

# Group 1: date and time; group 2: the "k00=...&k01=...&" payload.
LINE_RE = re.compile(r'^(.*)\s+\[error\]\s*\d+#\d+:\s+(.*)')
# Group 1: the value of one kNN field (non-greedy, up to the next "&" or EOL).
FIELD_RE = re.compile(r"k\d+=(.*?)(&|$)")

# Sample log lines:
#2023/10/19 17:41:16 [error] 18607#18607: k00=173.82.252.116&k01=www.xxx.com&k02=/404.html&k03=html&k04=&k05=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36&k06=787&k07=&k08=https://www.xxx.com/admin/&k09=https&k10=GET&k11=31&k12=0&k13=2774&k14=35&k15=15&k16=15&k17=0&k18=0&k19=2&k20=2&k21=2326&k22=12&k23=111&k24=3&k25=200&k26=707&k27=2128837&k28=27667921&k29=0&k30=0&k31=0&k32=0&
#2023/10/19 17:41:16 [error] 18607#18607: k00=173.82.252.116&k01=www.xx.com&k02=/api/sys/dns/gtm/source/page&k03=&k04=&k05=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36&k06=787&k07=US&k08=https://www.xxx.com/admin/&k09=https&k10=POST&k11=32&k12=0&k13=2774&k14=36&k15=16&k16=16&k17=0&k18=0&k19=2&k20=2&k21=2327&k22=13&k23=112&k24=3&k25=200&k26=3794&k27=2129624&k28=27671715&k29=0&k30=0&k31=0&k32=0&


def is_numeric(input_str):
    """Return True if *input_str* is all digits or parses as a float.

    Currently unused by the extraction itself; it can be used to replace
    non-numeric values by, e.g., their length before writing them out.
    """
    if input_str.isdigit():
        return True
    try:
        float(input_str)
        return True
    except ValueError:
        return False


def parse_line(line, previous_timestamp):
    """Turn one log line into a CSV row.

    Args:
        line: Raw log line (surrounding whitespace is ignored).
        previous_timestamp: POSIX timestamp of the previous matching line,
            or ``None`` if there was none yet.

    Returns:
        ``(row, timestamp)``. ``row`` is ``None`` and ``timestamp`` is
        *previous_timestamp* unchanged when the line does not match the
        expected format.

    Raises:
        ValueError: If the line matches but its date/time prefix cannot be
            parsed with ``TIME_FORMAT``.
    """
    found = LINE_RE.match(line.strip())
    if found is None:
        return None, previous_timestamp

    stamp_text, payload = found.groups()
    timestamp = datetime.strptime(stamp_text, TIME_FORMAT).timestamp()
    if previous_timestamp is None:
        # First record: the elapsed time is defined as zero.
        previous_timestamp = timestamp

    row = [timestamp - previous_timestamp]
    row.extend(value or EMPTY_VALUE for value, _sep in FIELD_RE.findall(payload))
    row.append(DEFAULT_TARGET)
    return row, timestamp


def main(log_path=LOG_PATH, csv_path=CSV_PATH, max_lines=MAX_LINES):
    """Append the rows extracted from *log_path* to *csv_path*.

    The header row is written only when *csv_path* does not exist yet or is
    empty, so running the script repeatedly does not inject header rows in
    the middle of the data.

    Args:
        log_path: Log file to read (decoded as UTF-8, undecodable bytes
            are dropped).
        csv_path: CSV file to append to.
        max_lines: Stop after this many log lines; ``None`` reads them all.
    """
    needs_header = (
        not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    )
    with open(log_path, "r", encoding="utf-8", errors="ignore") as log_file, \
            open(csv_path, mode="a", encoding="utf-8", newline="") as csv_file:
        writer = csv.writer(csv_file)
        if needs_header:
            writer.writerow(HEADER)

        previous_timestamp = None
        for line_num, line in enumerate(log_file, 1):
            if max_lines is not None and line_num > max_lines:
                print(f"Line {line_num}")
                break
            try:
                row, previous_timestamp = parse_line(line, previous_timestamp)
            except ValueError as exc:
                # Best effort: report the malformed line and keep going.
                print(f"Line {line_num}: {exc}")
                continue
            if row is not None:
                writer.writerow(row)


if __name__ == "__main__":
    main()
2 对数据源进行建模与预测
# -*- coding: utf-8 -*-
"""Train a CatBoost classifier on ``data.csv`` and report its accuracy.

Created on Tue Oct 17 14:20:41 2023

@author: Administrator
"""
# Import the required libraries.
import pandas as pd
from catboost import CatBoostClassifier,CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

DATA_PATH = "data.csv"
MODEL_PATH = "catboost_waf_model.bin"

# Load the data.
# pandas' default NA parsing would turn literal values such as "NA" or
# "null" into NaN; only treat truly empty fields as missing so that such
# strings survive as categorical values.
data = pd.read_csv(
    DATA_PATH,
    encoding="utf-8",
    keep_default_na=False,
    na_values=[""],
)
# NOTE(review): a classifier needs at least two distinct labels in "target".
# If the file was produced by an extraction step that writes a constant
# placeholder label, label the rows before training.
X = data.drop(["target"], axis=1)
y = data["target"]

# Columns holding text are treated as categorical features.
cat_features = X.select_dtypes(include=['object']).columns.tolist()
if cat_features:
    # Categorical values must be strings (or ints); NaN is not accepted,
    # so remaining missing values are replaced by an empty string.
    X[cat_features] = X[cat_features].fillna("").astype(str)

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define the CatBoost classifier and fit it on the training set.
model = CatBoostClassifier(
    iterations=100,
    cat_features=cat_features,
)
y_train_list = y_train.tolist()
model.fit(X_train, y_train_list)

# Save the model.
model.save_model(MODEL_PATH)

# Predict on the test set and compute the accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Using the saved model later:
# model = CatBoostClassifier()
# model.load_model("catboost_waf_model.bin")