Datawhale 智慧海洋建设-Task2 数据分析

陌羡尘 · 发表于数据处理软件 2022-11-16 23:59:18

此部分为智慧海洋建设竞赛的数据分析模块，通过数据分析，可以熟悉数据，为后面的特征工程做准备，欢迎大家后续多多交流。

赛题：智慧海洋建设

数据分析的目的:

EDA的主要价值在于熟悉整个数据集的基本情况(缺失值、异常值)，来确定所获得数据集可以用于接下来的机器学习或者深度学习使用。了解特征之间的相关性、分布，以及特征与预测值之间的关系。为进行特征工程提供理论依据。

项目地址：https://github.com/datawhalechina/team-learning-data-mining/tree/master/wisdomOcean

比赛地址：https://tianchi.aliyun.com/competition/entrance/231768/introduction?spm=5176.12281957.1004.8.4ac63eafE1rwsY

2.1 学习目标

学习如何对数据集整体概况进行分析，包括数据集的基本情况（缺失值、异常值）；学习了解变量之间的相互关系，以及变量与预测值之间存在的关系；完成相应学习打卡任务。

2.2 内容介绍

数据总体了解：读取数据集并了解数据集的大小、原始特征维度；通过 info 了解数据类型；粗略查看数据集中各特征的基本统计量。缺失值和唯一值：查看数据缺失值情况；查看唯一值情况。

数据特性和特征分布

三类渔船轨迹的可视化；坐标序列可视化；三类渔船速度和方向序列可视化；三类渔船速度和方向的数据分布。

作业一：剔除异常点后画图

import pandas as pd

import geopandas as gpd

from pyproj import Proj

from keplergl import KeplerGl

from tqdm import tqdm

import os

import matplotlib.pyplot as plt

import shapely

import numpy as np

from datetime import datetime

import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Use SimSun as the default sans-serif font.
plt.rcParams['font.sans-serif'] = ['SimSun']

# Work around the minus sign being rendered as a box (or raising) when a
# figure is saved.
plt.rcParams['axes.unicode_minus'] = False

#获取文件夹中的数据

def get_data(file_path, model):
    """Read every file of a folder into one DataFrame.

    The first line of each file is skipped as a header; every remaining line
    is stripped and split on commas.

    Args:
        file_path: Folder whose entries are all opened and parsed.
        model: Either 'train' or 'test'. For 'train' each row must already
            contain a seventh ``type`` field; for 'test' a ``type`` column
            filled with 'unknown' is appended.

    Returns:
        DataFrame with the columns ID, lat, lon, speed, direction, time, type.
        lat, lon and speed are cast to float and direction to int; the other
        columns keep the raw strings read from the files.

    Raises:
        AssertionError: If ``model`` is neither 'train' nor 'test'.
    """
    assert model in ['train', 'test'], '{} Not Support this type of file'.format(model)

    base_columns = ['ID', 'lat', 'lon', 'speed', 'direction', 'time']

    rows = []
    for name in tqdm(os.listdir(file_path)):
        with open('{}/{}'.format(file_path, name), encoding='utf-8') as f:
            # Skip the header line; the default keeps an empty file from
            # raising StopIteration.
            next(f, None)
            rows.extend(line.strip().split(',') for line in f)

    # Passing the column names to the constructor (instead of assigning them
    # afterwards) also yields a correctly-shaped empty frame when no rows
    # were read.
    if model == 'train':
        tmp_df = pd.DataFrame(rows, columns=base_columns + ['type'])
    else:
        tmp_df = pd.DataFrame(rows, columns=base_columns)
        tmp_df['type'] = 'unknown'

    for column in ('lat', 'lon', 'speed'):
        tmp_df[column] = tmp_df[column].astype(float)
    # If this cast fails, try upgrading pandas (note kept from the original).
    tmp_df['direction'] = tmp_df['direction'].astype(int)
    return tmp_df

# 平面坐标转经纬度，供初赛数据使用

# 选择标准为NAD83 / California zone 6 (ftUS) (EPSG:2230)，查询链接：CS2CS - Transform Coordinates On-line - MyGeodata Cloud

def transform_xy2lonlat(df):
    """Replace the planar coordinates in ``df`` with longitude/latitude.

    The ``lon`` and ``lat`` columns are passed (in that order) through the
    inverse of a Lambert conformal conic projection and overwritten with the
    result.

    Args:
        df: DataFrame with ``lat`` and ``lon`` columns. It is modified in
            place.

    Returns:
        The same ``df`` object with ``lon`` and ``lat`` overwritten.
    """
    x = df['lat'].values
    y = df['lon'].values
    # Projection string for NAD83 / California zone 6 (ftUS), EPSG:2230,
    # as stated by the comment above this function.
    p = Proj('+proj=lcc +lat_1=33.88333333333333 +lat_2=32.78333333333333 +lat_0=32.16666666666666 +lon_0=-116.25 +x_0=2000000.0001016 +y_0=500000.0001016001 +datum=NAD83 +units=us-ft +no_defs ')
    # NOTE(review): the 'lon' column is fed in as the first (x) argument and
    # 'lat' as the second (y) — confirm this matches the raw data layout.
    df['lon'], df['lat'] = p(y, x, inverse=True)
    return df

#修改数据的时间格式

def reformat_strtime(time_str=None, START_YEAR="2019"):
    """Reformat a 'MMDD HH:MM:SS' string as 'START_YEAR-MM-DD HH:MM:SS'.

    Args:
        time_str: Text whose first space-separated token starts with a
            two-character month followed by a two-character day, and whose
            second token is the time of day.
        START_YEAR: Year prefixed to the result.

    Returns:
        The reformatted string, e.g. '0814 12:30:00' -> '2019-08-14 12:30:00'.

    Raises:
        IndexError: If ``time_str`` contains no space-separated second token.
    """
    tokens = time_str.split(" ")
    month_day = tokens[0]
    clock = tokens[1]
    return "{}-{}-{} {}".format(START_YEAR, month_day[:2], month_day[2:4], clock)

#计算两个点的距离

def haversine_np(lon1, lat1, lon2, lat2, earth_radius_km=6367):
    """Return the haversine (great-circle) distance in metres.

    All four coordinate arguments are given in degrees and may be scalars or
    NumPy arrays that broadcast against each other.

    Args:
        lon1, lat1: Longitude and latitude of the first point(s).
        lon2, lat2: Longitude and latitude of the second point(s).
        earth_radius_km: Sphere radius in kilometres used for the conversion.

    Returns:
        Distance(s) in metres, with the broadcast shape of the inputs.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    km = earth_radius_km * c
    return km * 1000

def compute_traj_diff_time_distance(traj=None):
    """Add per-point sampling interval and travelled distance to ``traj``.

    Two columns are written to ``traj`` in place:

    * ``time_array``: minutes elapsed since the previous row.
    * ``dist_array``: ``haversine_np`` distance to the previous row.

    The first row has no predecessor, so it receives the mean of the other
    values (NaN when the trajectory has a single row).

    Args:
        traj: DataFrame with a datetime ``time`` column (``.dt`` is used on
            the differences) and ``lon`` / ``lat`` columns. Rows are assumed
            to already be in the desired order.

    Returns:
        The same ``traj`` object with the two columns added.
    """
    if len(traj) == 0:
        # Nothing to difference; still add the columns so callers can rely
        # on them being present.
        traj["time_array"] = np.array([], dtype=float)
        traj["dist_array"] = np.array([], dtype=float)
        return traj

    # Positional difference between consecutive timestamps, in minutes. The
    # leading NaT produced by diff() is dropped.
    time_diff = traj["time"].diff().iloc[1:].dt.total_seconds() / 60

    lon = traj["lon"].values
    lat = traj["lat"].values
    dist_diff = haversine_np(lon[1:], lat[1:], lon[:-1], lat[:-1])

    # Pad the first position with the mean of the remaining values.
    first_time = time_diff.mean()
    first_dist = dist_diff.mean() if len(dist_diff) else np.nan

    traj["time_array"] = [first_time] + time_diff.tolist()
    traj["dist_array"] = [first_dist] + dist_diff.tolist()
    return traj

#对轨迹进行异常点的剔除

def assign_traj_anomaly_points_nan(traj=None, speed_maximum=23,
                                   time_interval_maximum=200,
                                   coord_speed_maximum=700):
    """Blank out anomalous speeds and drop anomalous trajectory points.

    Steps, in order:

    1. Speeds above ``speed_maximum`` or below 0 are set to NaN (the rows are
       kept).
    2. ``coord_speed`` = ``dist_array`` / ``time_array`` is added, and rows
       whose ``time_array`` or ``coord_speed`` lies more than three standard
       deviations from its mean are dropped.
    3. Of the remaining rows, those whose ``lon`` or ``lat`` lies more than
       three standard deviations from its mean are dropped.

    Args:
        traj: DataFrame with ``speed``, ``lon``, ``lat``, ``time_array`` and
            ``dist_array`` columns. Steps 1 and 2 write to it in place.
        speed_maximum: Largest speed value accepted in step 1.
        time_interval_maximum: Accepted for compatibility; not used.
        coord_speed_maximum: Accepted for compatibility; not used.

    Returns:
        Tuple ``(cleaned, [n_removed])`` where ``cleaned`` is a new DataFrame
        with a fresh 0..n-1 index and ``n_removed`` is the number of rows
        dropped.
    """

    def outside_n_sigma(values, n):
        """Flag entries of ``values`` further than n std devs from the mean."""
        center = np.mean(values)
        spread = np.std(values)
        mask = (values < center - n * spread) | (values > center + n * spread)
        return np.asarray(mask, dtype=bool)

    n_points = len(traj)

    # Step 1: speed anomaly repairing. A single .loc assignment is used
    # because chained indexing (traj["speed"][mask] = ...) may write to a
    # temporary copy and leave traj untouched.
    is_speed_anomaly = (traj["speed"] > speed_maximum) | (traj["speed"] < 0)
    traj.loc[is_speed_anomaly, "speed"] = np.nan

    # Step 2: speed implied by consecutive coordinates and timestamps.
    traj["coord_speed"] = traj["dist_array"] / traj["time_array"]

    # Condition 1: 3-sigma rule on the sampling interval and on coord_speed.
    is_anomaly = outside_n_sigma(traj["time_array"], 3) | outside_n_sigma(traj["coord_speed"], 3)
    traj = traj[~is_anomaly].reset_index(drop=True)

    # Condition 2: 3-sigma rule on the coordinates of the surviving points.
    if len(traj) != 0:
        lon, lat = traj["lon"], traj["lat"]
        lon_mean, lon_std = lon.mean(), lon.std()
        lat_mean, lat_std = lat.mean(), lat.std()

        lon_low, lon_high = lon_mean - 3 * lon_std, lon_mean + 3 * lon_std
        lat_low, lat_high = lat_mean - 3 * lat_std, lat_mean + 3 * lat_std

        is_anomaly = (lon > lon_high) | (lon < lon_low) | (lat > lat_high) | (lat < lat_low)
        traj = traj[~is_anomaly].reset_index(drop=True)

    return traj, [n_points - len(traj)]

df = get_data(r'C:\Users\admin\hy_round1_train_20200102', 'train')

# The helpers below need longitude/latitude in degrees and a datetime `time`
# column (compute_traj_diff_time_distance uses `.dt` and haversine_np).
# NOTE(review): these two preprocessing steps were missing from the script;
# confirm the raw files really hold planar coordinates and 'MMDD HH:MM:SS'
# timestamps.
df = transform_xy2lonlat(df)
df['time'] = pd.to_datetime(df['time'].apply(reformat_strtime), format='%Y-%m-%d %H:%M:%S')

# Remove anomalous points from every trajectory and linearly interpolate the
# speeds that were blanked out.
ID_list = df['ID'].value_counts().index.tolist()
DF_NEW = []
Anomaly_count = []

# Group once instead of re-filtering the whole frame for every ID.
tracks_by_id = df.groupby('ID')

for ID in tqdm(ID_list):
    # Work on a copy so the helper's in-place column writes never touch df.
    df_id = compute_traj_diff_time_distance(tracks_by_id.get_group(ID).copy())
    df_new, count = assign_traj_anomaly_points_nan(df_id)

    df_new["speed"] = df_new["speed"].interpolate(method="linear", axis=0)
    # Fill whatever interpolation could not reach at either end.
    df_new = df_new.bfill()
    df_new = df_new.ffill()
    df_new["speed"] = df_new["speed"].clip(0, 23)

    Anomaly_count.append(count)  # number of points removed for this ID
    DF_NEW.append(df_new)

# Persist the cleaned trajectories as a pickle file.
# NOTE(review): Load_Save_Data is not defined in the visible part of this
# file — it must be provided elsewhere.
load_save = Load_Save_Data()
load_save.save_data(DF_NEW, "C:/Users/admin/wisdomOcean/data_tmp1/total_data.pkl")

#### 三类渔船速度和方向可视化

# 把训练集的所有数据,根据类别存放到不同的数据文件中

def get_diff_data(data_dir="C:/Users/admin/wisdomOcean/data_tmp1/"):
    """Split the saved trajectories into one pickle file per vessel type.

    Loads ``total_data.pkl`` from ``data_dir``, groups its trajectories by
    the first value of their ``type`` column and stores each group in
    ``data_dir`` through ``Load_Save_Data.save_data``.

    Args:
        data_dir: Folder (with trailing separator) holding ``total_data.pkl``
            and receiving the three per-type files.
    """
    # Imported locally because the file-level imports do not include pickle.
    import pickle

    with open(data_dir + "total_data.pkl", "rb") as f:
        # NOTE: unpickling can execute arbitrary code; only load files that
        # this pipeline wrote itself.
        total_data = pickle.load(f)

    # NOTE(review): Load_Save_Data is not defined in the visible part of this
    # file — it must be provided elsewhere.
    load_save = Load_Save_Data()

    kind_data = ["刺网", "围网", "拖网"]
    file_names = ["ciwang_data.pkl", "weiwang_data.pkl", "tuowang_data.pkl"]

    # One pass over the trajectories instead of one pass per type.
    grouped = [[] for _ in kind_data]
    for data in total_data:
        if len(data) == 0:
            # An empty trajectory has no type to read; skip it rather than
            # fail on the lookup below.
            continue
        kind = data["type"].iloc[0]
        if kind in kind_data:
            grouped[kind_data.index(kind)].append(data)

    for data_type, file_name in zip(grouped, file_names):
        load_save.save_data(data_type, data_dir + file_name)


get_diff_data()

#对轨迹进行异常点剔除，对nan值进行线性插值

# NOTE(review): this loop repeats the cleaning pass that already ran before
# the results were saved; it rebuilds ID_list, DF_NEW and Anomaly_count from
# the same `df`. It is kept because later code may rely on these names.
ID_list = df['ID'].value_counts().index.tolist()
DF_NEW = []
Anomaly_count = []

for ID in tqdm(ID_list):
    # Work on a copy so the helper's in-place column writes never touch df.
    df_id = compute_traj_diff_time_distance(df[df['ID'] == ID].copy())
    df_new, count = assign_traj_anomaly_points_nan(df_id)

    df_new["speed"] = df_new["speed"].interpolate(method="linear", axis=0)
    # Fill whatever interpolation could not reach at either end.
    df_new = df_new.bfill()
    df_new = df_new.ffill()
    df_new["speed"] = df_new["speed"].clip(0, 23)

    Anomaly_count.append(count)  # number of points removed for this ID
    DF_NEW.append(df_new)

# 每类轨迹，随机选取某个渔船，可视化速度序列和方向序列

def visualize_three_traj_speed_direction():
    """Plot the speed and direction sequences of one trajectory per type.

    Draws a 3x2 grid: one row per entry of ``file_types``, with the "速度"
    column in the left panel and the "方向" column in the right panel, then
    shows the figure.
    """
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20, 15))
    plt.subplots_adjust(wspace=0.3, hspace=0.3)

    file_types = ["ciwang_data", "weiwang_data", "tuowang_data"]
    speed_types = ["ciwang_speed", "weiwang_speed", "tuowang_speed"]
    direction_types = ["ciwang_direction", "weiwang_direction", "tuowang_direction"]
    colors = ['pink', 'lightblue', 'lightgreen']

    # Wrap the list (not the enumerate object) so tqdm knows the total.
    for i, file_name in enumerate(tqdm(file_types)):
        # NOTE(review): get_random_one_traj is not defined in the visible
        # part of this file; it is presumed to return a DataFrame with "速度"
        # and "方向" columns.
        datax = get_random_one_traj(type=file_name)

        # .loc[-1:] is a label-based slice; with a default integer index it
        # presumably selects every row — TODO confirm.
        x_data = datax["速度"].loc[-1:].values
        y_data = datax["方向"].loc[-1:].values

        panels = (
            (axes[i][0], x_data, speed_types[i]),
            (axes[i][1], y_data, direction_types[i]),
        )
        for ax, series, label in panels:
            ax.plot(range(len(series)), series, label=label, color=colors[i])
            # The original passed alpha=2, which is outside matplotlib's
            # valid 0-1 range; 1.0 is the closest valid (fully opaque) value.
            ax.grid(alpha=1.0)
            ax.legend(loc="best")

    plt.show()


visualize_three_traj_speed_direction()

作业二：相关性分析。

# Encode the vessel type as a numeric id so it can take part in the
# correlation matrix.
# NOTE(review): `data_train` is not defined in the visible part of this file,
# while the heatmap below reads `df`; if they are meant to be the same frame
# the names should be unified — confirm.
data_train.loc[data_train['type'] == '刺网', 'type_id'] = 1
data_train.loc[data_train['type'] == '围网', 'type_id'] = 2
data_train.loc[data_train['type'] == '拖网', 'type_id'] = 3

f, ax = plt.subplots(figsize=(9, 6))

# Restrict to numeric columns: recent pandas raises on string columns in
# corr() instead of silently dropping them.
# NOTE(review): `sns` (seaborn) is used here but never imported in the
# visible part of this file.
ax = sns.heatmap(np.abs(df.select_dtypes(include='number').corr()), annot=True)

plt.show()

从图中可以清楚看到，经纬度和速度跟类型相关性比较大。

Datawhale 智慧海洋建设-Task2 数据分析

您看了很久哦，登陆下吧！