此部分为智慧海洋建设竞赛的数据分析模块。通过数据分析,可以熟悉数据,为后面的特征工程做准备,欢迎大家后续多多交流。赛题:智慧海洋建设
数据分析的目的:
EDA的主要价值在于熟悉整个数据集的基本情况(缺失值、异常值),以确定所获得的数据集可以用于接下来的机器学习或者深度学习;了解特征之间的相关性、分布,以及特征与预测值之间的关系;为进行特征工程提供理论依据。
项目地址:https://github.com/datawhalechina/team-learning-data-mining/tree/master/wisdomOcean
比赛地址:https://tianchi.aliyun.com/competition/entrance/231768/introduction?spm=5176.12281957.1004.8.4ac63eafE1rwsY
2.1 学习目标:学习如何对数据集整体概况进行分析,包括数据集的基本情况(缺失值、异常值);学习了解变量之间的相互关系、变量与预测值之间的存在关系;完成相应学习打卡任务。
2.2 内容介绍:数据总体了解——读取数据集并了解数据集的大小、原始特征维度;通过info了解数据类型;粗略查看数据集中各特征的基本统计量。缺失值和唯一值——查看数据缺失值情况;查看唯一值情况。数据特性和特征分布——
三类渔船轨迹的可视化;坐标序列可视化;三类渔船速度和方向序列可视化;三类渔船速度和方向的数据分布。
作业一:剔除异常点后画图
import pandas as pd
import os
import pickle
from datetime import datetime

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import shapely
from keplergl import KeplerGl
from pyproj import Proj
from tqdm import tqdm
import warnings

# Silence library warnings for the whole analysis session.
warnings.filterwarnings("ignore")
# Use SimSun as the default sans-serif font so that CJK labels render.
plt.rcParams["font.sans-serif"] = ["SimSun"]
# Keep the minus sign from rendering as a box when figures are saved.
plt.rcParams["axes.unicode_minus"] = False
# Load every trajectory file found in a folder.
def get_data(file_path, model):
    """Read all CSV files in ``file_path`` into a single DataFrame.

    The first line of every file is treated as a header and skipped; every
    other non-blank line is split on commas.

    Args:
        file_path: Directory whose entries are all read as UTF-8 text files.
        model: ``"train"`` (files carry a ``type`` column) or ``"test"``
            (no ``type`` column; it is filled with ``"unknown"``).

    Returns:
        DataFrame with columns ID, lat, lon, speed, direction, time, type.
        lat/lon/speed are cast to float and direction to int; the remaining
        columns stay as strings.

    Raises:
        AssertionError: if ``model`` is neither ``"train"`` nor ``"test"``.
    """
    assert model in ["train", "test"], "{} Not Support this type of file".format(model)
    columns = ["ID", "lat", "lon", "speed", "direction", "time", "type"]

    rows = []
    for name in tqdm(os.listdir(file_path)):
        with open(os.path.join(file_path, name), encoding="utf-8") as f:
            # Skip the header; the default keeps an empty file from raising.
            next(f, None)
            for line in f:
                line = line.strip()
                if line:  # ignore blank lines instead of emitting junk rows
                    rows.append(line.split(","))

    # Passing the column names up front also gives a well-formed (empty)
    # frame when the directory holds no data rows.
    if model == "train":
        tmp_df = pd.DataFrame(rows, columns=columns)
    else:
        tmp_df = pd.DataFrame(rows, columns=columns[:-1])
        tmp_df["type"] = "unknown"

    tmp_df["lat"] = tmp_df["lat"].astype(float)
    tmp_df["lon"] = tmp_df["lon"].astype(float)
    tmp_df["speed"] = tmp_df["speed"].astype(float)
    # If this cast fails, try upgrading pandas.
    tmp_df["direction"] = tmp_df["direction"].astype(int)
    return tmp_df
# Convert planar coordinates to longitude/latitude (for the preliminary-round data).
# The projection is NAD83 / California zone 6 (ftUS) (EPSG:2230); it can be looked
# up with "CS2CS - Transform Coordinates On-line - MyGeodata Cloud".
def transform_xy2lonlat(df):
    """Replace the planar values in ``df["lon"]``/``df["lat"]`` with degrees.

    The ``lon`` column is passed to the inverse projection as the projected x
    coordinate and the ``lat`` column as the projected y coordinate.

    Args:
        df: DataFrame with numeric ``lat`` and ``lon`` columns.

    Returns:
        The same DataFrame object; ``lon`` and ``lat`` are overwritten in place.
    """
    projected_y = df["lat"].values
    projected_x = df["lon"].values
    p = Proj("+proj=lcc +lat_1=33.88333333333333 +lat_2=32.78333333333333 +lat_0=32.16666666666666 +lon_0=-116.25 +x_0=2000000.0001016 +y_0=500000.0001016001 +datum=NAD83 +units=us-ft +no_defs ")
    df["lon"], df["lat"] = p(projected_x, projected_y, inverse=True)
    return df
# Rewrite the raw time string into a full date-time string.
def reformat_strtime(time_str=None, START_YEAR="2019"):
    """Turn ``"MMDD HH:MM:SS"`` into ``"START_YEAR-MM-DD HH:MM:SS"``.

    Args:
        time_str: Raw stamp; the part before the first space supplies the
            month (characters 0-1) and day (characters 2-3), the part after
            it is copied through unchanged.
        START_YEAR: Year prefix, as a string.

    Returns:
        The reformatted string, e.g. ``"0814 12:30:00"`` becomes
        ``"2019-08-14 12:30:00"``.
    """
    parts = time_str.split(" ")
    month_day = parts[0]
    date_part = "{}-{}-{}".format(START_YEAR, month_day[:2], month_day[2:4])
    return date_part + " " + parts[1]
# Distance between two points on the sphere.
def haversine_np(lon1, lat1, lon2, lat2):
    """Return the haversine distance in metres between two points.

    All four arguments are in degrees and may be scalars or NumPy arrays;
    arrays are combined element-wise. A sphere radius of 6367 km is used.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would turn arcsin(sqrt(a)) into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km * 1000
def compute_traj_diff_time_distance(traj=None):
    """Add per-point sampling interval and travelled distance to ``traj``.

    Two columns are written onto ``traj`` itself:

    * ``time_array``: minutes elapsed since the previous point;
    * ``dist_array``: ``haversine_np`` distance (metres) from the previous point.

    The first point has no predecessor, so it receives the mean of the
    remaining values in each column.

    Args:
        traj: DataFrame with a datetime ``time`` column (the ``.dt`` accessor
            is used on its differences) and numeric ``lon``/``lat`` columns.

    Returns:
        The same DataFrame object with the two columns added.
    """
    if len(traj) == 0:
        # Nothing to difference; still provide the columns callers expect.
        traj["time_array"] = pd.Series(dtype=float)
        traj["dist_array"] = pd.Series(dtype=float)
        return traj

    # Time difference between consecutive points, in minutes.
    later = traj["time"].iloc[1:].reset_index(drop=True)
    earlier = traj["time"].iloc[:-1].reset_index(drop=True)
    time_diff_array = (later - earlier).dt.total_seconds() / 60

    # Distance between consecutive coordinates.
    dist_diff_array = haversine_np(traj["lon"].values[1:],   # lon_0
                                   traj["lat"].values[1:],   # lat_0
                                   traj["lon"].values[:-1],  # lon_1
                                   traj["lat"].values[:-1])  # lat_1

    # Fill the first value with the mean, then assign positionally so that
    # duplicate index labels cannot cause a length mismatch.
    traj["time_array"] = [time_diff_array.mean()] + time_diff_array.tolist()
    traj["dist_array"] = [dist_diff_array.mean()] + dist_diff_array.tolist()
    return traj
# Remove anomalous points from a trajectory.
def assign_traj_anomaly_points_nan(traj=None, speed_maximum=23,
                                   time_interval_maximum=200,
                                   coord_speed_maximum=700):
    """Blank out implausible speeds and drop 3-sigma outlier points.

    Steps:
      1. Speeds above ``speed_maximum`` or below 0 are set to NaN (on the
         input frame itself).
      2. ``coord_speed = dist_array / time_array`` is added to the input
         frame; rows whose ``time_array`` or ``coord_speed`` lies more than
         three population standard deviations from its mean are dropped.
      3. Of the remaining rows, those whose ``lon`` or ``lat`` lies more than
         three sample standard deviations from its mean are dropped.

    Args:
        traj: DataFrame with ``speed``, ``dist_array``, ``time_array``,
            ``lon`` and ``lat`` columns.
        speed_maximum: Upper bound for a valid reported speed.
        time_interval_maximum: Not used by the current implementation; kept
            for interface compatibility.
        coord_speed_maximum: Not used by the current implementation; kept
            for interface compatibility.

    Returns:
        ``(cleaned, [n_removed])``: the filtered frame with a fresh 0..n-1
        index, and a one-element list holding the number of dropped rows.
    """
    def _sigma_outliers(values, n):
        # Positional boolean mask of entries further than n population
        # standard deviations from the mean (NaN entries are never flagged).
        center = values.mean()
        spread = values.std(ddof=0)
        outside = (values < center - n * spread) | (values > center + n * spread)
        return outside.to_numpy()

    # Step 1: speed anomaly repairing. `.loc` writes to the frame itself;
    # chained indexing may write to a temporary copy instead.
    is_speed_anomaly = (traj["speed"] > speed_maximum) | (traj["speed"] < 0)
    traj.loc[is_speed_anomaly, "speed"] = np.nan

    # Step 2: speed derived from distance and elapsed time.
    traj["coord_speed"] = traj["dist_array"] / traj["time_array"]

    # Condition 1: 3-sigma rule on the sampling interval and the coordinate speed.
    is_anomaly = _sigma_outliers(traj["time_array"], 3) | _sigma_outliers(traj["coord_speed"], 3)
    traj = traj[~is_anomaly].reset_index(drop=True)

    # Condition 2: 3-sigma rule on the coordinates of the surviving points.
    if len(traj) != 0:
        lon_std, lon_mean = traj["lon"].std(), traj["lon"].mean()
        lat_std, lat_mean = traj["lat"].std(), traj["lat"].mean()
        lon_low, lon_high = lon_mean - 3 * lon_std, lon_mean + 3 * lon_std
        lat_low, lat_high = lat_mean - 3 * lat_std, lat_mean + 3 * lat_std

        is_anomaly = ((traj["lon"] > lon_high) | (traj["lon"] < lon_low)
                      | (traj["lat"] > lat_high) | (traj["lat"] < lat_low))
        traj = traj[~is_anomaly].reset_index(drop=True)
    return traj, [len(is_speed_anomaly) - len(traj)]
# Load the round-1 training set.
df = get_data(r"C:\Users\admin\hy_round1_train_20200102", "train")
# get_data returns planar coordinates and "MMDD HH:MM:SS" strings, while the
# steps below need lon/lat degrees (haversine) and real datetimes (`.dt`).
# NOTE(review): the transform is the one the file marks as intended for the
# preliminary-round data; skip it if the input is already in degrees.
df = transform_xy2lonlat(df)
df["time"] = pd.to_datetime(df["time"].map(reformat_strtime),
                            format="%Y-%m-%d %H:%M:%S")

# Remove anomalous points per trajectory and linearly interpolate NaN speeds.
ID_list = list(df["ID"].value_counts().index)
DF_NEW = []
Anomaly_count = []
# Group once instead of scanning the whole frame for every ID.
trajectories = df.groupby("ID", sort=False)
for vessel_id in tqdm(ID_list):
    # Work on a copy: the helper adds columns to the frame it is given.
    df_id = compute_traj_diff_time_distance(trajectories.get_group(vessel_id).copy())
    df_new, count = assign_traj_anomaly_points_nan(df_id)
    df_new["speed"] = df_new["speed"].interpolate(method="linear", axis=0)
    # Leading NaNs are back-filled first, whatever remains is forward-filled.
    df_new = df_new.bfill()
    df_new = df_new.ffill()
    df_new["speed"] = df_new["speed"].clip(0, 23)
    Anomaly_count.append(count)  # number of anomalous points removed per ID
    DF_NEW.append(df_new)

# Persist the cleaned trajectories as a pickle file.
# NOTE(review): Load_Save_Data is not defined in the visible part of this
# file; it is assumed to be provided elsewhere.
load_save = Load_Save_Data()
load_save.save_data(DF_NEW, "C:/Users/admin/wisdomOcean/data_tmp1/total_data.pkl")
#### Speed and direction visualisation for the three fishing-gear types
# Split the cleaned training trajectories into one pickle file per gear type.
def get_diff_data(data_dir="C:/Users/admin/wisdomOcean/data_tmp1/"):
    """Write one pickle per gear type from ``total_data.pkl``.

    ``total_data.pkl`` in ``data_dir`` is expected to hold a list of
    per-vessel DataFrames. Each frame is assigned to the gear type found in
    the first row of its ``type`` column; empty frames are skipped. The
    resulting lists are saved as ``ciwang_data.pkl``, ``weiwang_data.pkl``
    and ``tuowang_data.pkl`` in the same directory.

    Args:
        data_dir: Directory that holds ``total_data.pkl`` and receives the
            three output files.
    """
    with open(os.path.join(data_dir, "total_data.pkl"), "rb") as f:
        # NOTE: unpickling can execute arbitrary code; only load files that
        # this pipeline produced itself.
        total_data = pickle.load(f)

    # NOTE(review): Load_Save_Data is not defined in the visible part of this
    # file; it is assumed to be provided elsewhere.
    load_save = Load_Save_Data()
    kind_data = ["刺网", "围网", "拖网"]
    file_names = ["ciwang_data.pkl", "weiwang_data.pkl", "tuowang_data.pkl"]
    for kind, file_name in zip(kind_data, file_names):
        data_type = [data for data in total_data
                     if not data.empty and data["type"].iloc[0] == kind]
        load_save.save_data(data_type, os.path.join(data_dir, file_name))
get_diff_data()
# The per-ID cleaning loop that used to follow this call was a verbatim copy
# of the loop that builds DF_NEW and Anomaly_count before total_data.pkl is
# written. It recomputed the same values from the same `df`, so it has been
# dropped; both names keep the values from that first pass.
# For each gear type, pick one vessel at random and plot its speed and direction series.
def visualize_three_traj_speed_direction():
    """Plot speed (left column) and direction (right column) for three vessels.

    One trajectory per gear type is fetched with ``get_random_one_traj`` and
    drawn on its own row of a 3x2 grid; the figure is then shown.
    """
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20, 15))
    plt.subplots_adjust(wspace=0.3, hspace=0.3)

    file_types = ["ciwang_data", "weiwang_data", "tuowang_data"]
    speed_types = ["ciwang_speed", "weiwang_speed", "tuowang_speed"]
    directions = ["ciwang_direction", "weiwang_direction", "tuowang_direction"]
    colors = ["pink", "lightblue", "lightgreen"]

    for i, file_name in enumerate(tqdm(file_types)):
        # NOTE(review): get_random_one_traj is not defined in the visible part
        # of this file. The frames built by get_data use "speed"/"direction"
        # column names; confirm that it really returns "速度"/"方向" columns.
        datax = get_random_one_traj(type=file_name)
        # Plot the whole series (the former `.loc[-1:]` selected every row on
        # a default integer index and failed on other index types).
        panels = [(datax["速度"].values, speed_types[i]),
                  (datax["方向"].values, directions[i])]
        for ax, (values, label) in zip(axes[i], panels):
            ax.plot(range(len(values)), values, label=label, color=colors[i])
            # alpha must lie in [0, 1]; 1 keeps the grid fully opaque.
            ax.grid(alpha=1)
            ax.legend(loc="best")
    plt.show()


visualize_three_traj_speed_direction()
8 y9 L9 ~) A! e- P0 p x7 R
作业二:相关性分析。
# Encode the gear type as an integer so it can take part in the correlation matrix.
# NOTE(review): `data_train` and `sns` (seaborn) are not defined or imported in
# the visible part of this file; they are assumed to be provided elsewhere.
for type_id, type_name in enumerate(["刺网", "围网", "拖网"], start=1):
    data_train.loc[data_train["type"] == type_name, "type_id"] = type_id

f, ax = plt.subplots(figsize=(9, 6))
# Correlate the frame that actually carries `type_id`, restricted to numeric
# columns so that string columns such as ID/time/type cannot break corr().
ax = sns.heatmap(np.abs(data_train.select_dtypes(include="number").corr()), annot=True)
plt.show()
$ W0 |7 N: a( f K" Y
从图中可以清楚看到,经纬度和速度跟类型相关性比较大。
1 E; i& \% T$ Y( A& R1 Q8 `
1 v- L: q3 v% [/ ^" }: {0 W- x+ Z+ ` S) z4 E l
2 @6 \; s( C; X5 e
! I& s- N5 P" o8 I1 m% b. V5 H7 Z9 p
|