1
2
3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
1
2
3
4
# Load the turbine measurements. "Unnamed: 0" is the name pandas assigns to a
# CSV column without a header; here it is parsed as timestamps.
# low_memory=False makes pandas read the file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
data = pd.read_csv(
    "../data/Turbine_Data.csv",
    parse_dates=["Unnamed: 0"],
    low_memory=False,
)
# Show the last five rows as a quick check of the load.
data.tail()
Unnamed: 0 ActivePower AmbientTemperatue BearingShaftTemperature Blade1PitchAngle Blade2PitchAngle Blade3PitchAngle ControlBoxTemperature GearboxBearingTemperature GearboxOilTemperature ... GeneratorWinding2Temperature HubTemperature MainBoxTemperature NacellePosition ReactivePower RotorRPM TurbineStatus WTG WindDirection WindSpeed
118219 2020-03-30 23:10:00+00:00 70.044465 27.523741 45.711129 1.515669 1.950088 1.950088 0.0 59.821165 55.193793 ... 58.148777 39.008931 36.476562 178.0 13.775785 9.234004 2.0 G01 178.0 3.533445
118220 2020-03-30 23:20:00+00:00 40.833474 27.602882 45.598573 1.702809 2.136732 2.136732 0.0 59.142038 54.798545 ... 57.550367 39.006759 36.328125 178.0 8.088928 9.229370 2.0 G01 178.0 3.261231
118221 2020-03-30 23:30:00+00:00 20.777790 27.560925 45.462045 1.706214 2.139664 2.139664 0.0 58.439439 54.380456 ... 57.099335 39.003815 36.131944 178.0 4.355978 9.236802 2.0 G01 178.0 3.331839
118222 2020-03-30 23:40:00+00:00 62.091039 27.810472 45.343827 1.575352 2.009781 2.009781 0.0 58.205413 54.079014 ... 56.847239 39.003815 36.007805 190.0 12.018077 9.237374 2.0 G01 190.0 3.284468
118223 2020-03-30 23:50:00+00:00 68.664425 27.915828 45.231610 1.499323 1.933124 1.933124 0.0 58.581716 54.080505 ... 56.741040 39.003815 35.914062 203.0 14.439669 9.235532 2.0 G01 203.0 3.475205

5 rows × 22 columns

1
# Print each column's dtype and non-null count, plus overall memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118224 entries, 0 to 118223
Data columns (total 22 columns):
 #   Column                        Non-Null Count   Dtype              
---  ------                        --------------   -----              
 0   Unnamed: 0                    118224 non-null  datetime64[ns, UTC]
 1   ActivePower                   94750 non-null   float64            
 2   AmbientTemperatue             93817 non-null   float64            
 3   BearingShaftTemperature       62518 non-null   float64            
 4   Blade1PitchAngle              41996 non-null   float64            
 5   Blade2PitchAngle              41891 non-null   float64            
 6   Blade3PitchAngle              41891 non-null   float64            
 7   ControlBoxTemperature         62160 non-null   float64            
 8   GearboxBearingTemperature     62540 non-null   float64            
 9   GearboxOilTemperature         62438 non-null   float64            
 10  GeneratorRPM                  62295 non-null   float64            
 11  GeneratorWinding1Temperature  62427 non-null   float64            
 12  GeneratorWinding2Temperature  62449 non-null   float64            
 13  HubTemperature                62406 non-null   float64            
 14  MainBoxTemperature            62507 non-null   float64            
 15  NacellePosition               72278 non-null   float64            
 16  ReactivePower                 94748 non-null   float64            
 17  RotorRPM                      62127 non-null   float64            
 18  TurbineStatus                 62908 non-null   float64            
 19  WTG                           118224 non-null  object             
 20  WindDirection                 72278 non-null   float64            
 21  WindSpeed                     94595 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(20), object(1)
memory usage: 19.8+ MB
1
2
# Scatter the first 1000 ActivePower readings against their timestamps.
# The figure handle is bound to the conventional name `fig` (the original
# `ig` was a truncated spelling and was not used anywhere in the visible cells).
fig, ax = plt.subplots()
ax.scatter(data["Unnamed: 0"][:1000], data["ActivePower"][:1000])
<matplotlib.collections.PathCollection at 0x72912e32a800>


png

1
# Histogram of the ActivePower column (missing values are not plotted).
data.ActivePower.plot.hist()
<Axes: ylabel='Frequency'>


png

1
2
3
4
# Give the timestamp column a descriptive name. pop() removes the original
# 'Unnamed: 0' column from the frame in place and returns it; the assignment
# then appends it as the last column under the name 'DateTime'.
data['DateTime'] = data.pop('Unnamed: 0')
# Preview the first 20 timestamps.
data['DateTime'].head(20)

0    2017-12-31 00:00:00+00:00
1    2017-12-31 00:10:00+00:00
2    2017-12-31 00:20:00+00:00
3    2017-12-31 00:30:00+00:00
4    2017-12-31 00:40:00+00:00
5    2017-12-31 00:50:00+00:00
6    2017-12-31 01:00:00+00:00
7    2017-12-31 01:10:00+00:00
8    2017-12-31 01:20:00+00:00
9    2017-12-31 01:30:00+00:00
10   2017-12-31 01:40:00+00:00
11   2017-12-31 01:50:00+00:00
12   2017-12-31 02:00:00+00:00
13   2017-12-31 02:10:00+00:00
14   2017-12-31 02:20:00+00:00
15   2017-12-31 02:30:00+00:00
16   2017-12-31 02:40:00+00:00
17   2017-12-31 02:50:00+00:00
18   2017-12-31 03:00:00+00:00
19   2017-12-31 03:10:00+00:00
Name: DateTime, dtype: datetime64[ns, UTC]
1
2
3
4
5
6
7
8
9
10
# Ensure DateTime is a timezone-aware (UTC) datetime column.
# No fixed `format` is passed on purpose: the column is already parsed when the
# CSV is read, and if it ever arrived as strings instead, a format that does
# not match them would (together with errors='coerce') silently turn every
# timestamp into NaT. errors='coerce' still maps genuinely unparseable values
# to NaT instead of raising.
data['DateTime'] = pd.to_datetime(data['DateTime'], utc=True, errors='coerce')

# Split the timestamp into separate numeric columns, in this order.
for part in ('year', 'month', 'day', 'hour', 'minute'):
    data[part] = getattr(data['DateTime'].dt, part)

# The combined timestamp is dropped once its parts are stored as columns.
data.drop(columns='DateTime', inplace=True)
1
# Display the frame to check the year/month/day/hour/minute columns.
data
ActivePower AmbientTemperatue BearingShaftTemperature Blade1PitchAngle Blade2PitchAngle Blade3PitchAngle ControlBoxTemperature GearboxBearingTemperature GearboxOilTemperature GeneratorRPM ... RotorRPM TurbineStatus WTG WindDirection WindSpeed year month day hour minute
0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN G01 NaN NaN 2017 12 31 0 0
1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN G01 NaN NaN 2017 12 31 0 10
2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN G01 NaN NaN 2017 12 31 0 20
3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN G01 NaN NaN 2017 12 31 0 30
4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN G01 NaN NaN 2017 12 31 0 40
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
118219 70.044465 27.523741 45.711129 1.515669 1.950088 1.950088 0.0 59.821165 55.193793 1029.870744 ... 9.234004 2.0 G01 178.0 3.533445 2020 3 30 23 10
118220 40.833474 27.602882 45.598573 1.702809 2.136732 2.136732 0.0 59.142038 54.798545 1030.160478 ... 9.229370 2.0 G01 178.0 3.261231 2020 3 30 23 20
118221 20.777790 27.560925 45.462045 1.706214 2.139664 2.139664 0.0 58.439439 54.380456 1030.137822 ... 9.236802 2.0 G01 178.0 3.331839 2020 3 30 23 30
118222 62.091039 27.810472 45.343827 1.575352 2.009781 2.009781 0.0 58.205413 54.079014 1030.178178 ... 9.237374 2.0 G01 190.0 3.284468 2020 3 30 23 40
118223 68.664425 27.915828 45.231610 1.499323 1.933124 1.933124 0.0 58.581716 54.080505 1029.834789 ... 9.235532 2.0 G01 203.0 3.475205 2020 3 30 23 50

118224 rows × 26 columns

1
# Count the missing values in each column.
data.isna().sum()
ActivePower                     23474
AmbientTemperatue               24407
BearingShaftTemperature         55706
Blade1PitchAngle                76228
Blade2PitchAngle                76333
Blade3PitchAngle                76333
ControlBoxTemperature           56064
GearboxBearingTemperature       55684
GearboxOilTemperature           55786
GeneratorRPM                    55929
GeneratorWinding1Temperature    55797
GeneratorWinding2Temperature    55775
HubTemperature                  55818
MainBoxTemperature              55717
NacellePosition                 45946
ReactivePower                   23476
RotorRPM                        56097
TurbineStatus                   55316
WTG                                 0
WindDirection                   45946
WindSpeed                       23629
year                                0
month                               0
day                                 0
hour                                0
minute                              0
dtype: int64
1
2
3
4
5
6
# Replace missing values in numeric columns with the column median, and add a
# boolean "<column>_is_missing" column marking which rows were filled in.
# The loop walks a snapshot of the column names because it adds new columns
# to `data` as it goes; the added flag columns are therefore never revisited.
for label in list(data.columns):
    content = data[label]
    if not pd.api.types.is_numeric_dtype(content):
        continue  # non-numeric columns are left untouched
    missing = content.isna()
    if not missing.any():
        continue  # nothing to impute, so no flag column is created
    data[label + "_is_missing"] = missing
    data[label] = content.fillna(content.median())