单条轨迹的处理:geolife笔记:整理处理单条轨迹-CSDN博客
1 加载数据
import pandas as pd
import numpy as np
import datetime as dt
import os

# Root of the Geolife dataset; the numbered user folders live under Data/.
# (The original had `import os` fused onto the same line as this assignment.)
data_dir = 'Geolife Trajectories 1.3/Data/'
1.1 列出所有文件夹
# List the per-user folders (e.g. '133', '079', ...) under the dataset root.
# NOTE: os.listdir returns entries in arbitrary, filesystem-dependent order.
dirlist = os.listdir(data_dir)
dirlist
'''
['133','079','173','020','003','004','014','074',
...
'''
1.2 拼接出所有绝对路径
# Build the path to each user's Trajectory folder.
# os.path.join avoids the duplicated '//' that naive string concatenation
# produced (data_dir already ends with '/'), and `user_dir` avoids shadowing
# the builtin `dir`.
folder_dirs = [os.path.join(data_dir, user_dir, 'Trajectory')
               for user_dir in dirlist]
folder_dirs
'''
['data/Geolife Trajectories 1.3/Data//133/Trajectory','data/Geolife Trajectories 1.3/Data//079/Trajectory','data/Geolife Trajectories 1.3/Data//173/Trajectory','data/Geolife Trajectories 1.3/Data//020/Trajectory','data/Geolife Trajectories 1.3/Data//003/Trajectory',
...
'''
1.3 列出所有文件
# Collect every trajectory (.plt) file across all user folders.
# (The original collapsed both `for` loops onto one line, which is not
# valid Python.)
file_dirs = []
for folder in folder_dirs:
    for fname in os.listdir(folder):
        file_dirs.append(folder + '/' + fname)
len(file_dirs), file_dirs
'''
(18670,['data/Geolife Trajectories 1.3/Data//133/Trajectory/20110130143621.plt','data/Geolife Trajectories 1.3/Data//133/Trajectory/20110419143237.plt','data/Geolife Trajectories 1.3/Data//133/Trajectory/20110421082008.plt','data/Geolife Trajectories 1.3/Data//133/Trajectory/20110420024807.plt',
...
'''
2 读取所有文件,并拼接到一个DataFrame中
2.1 计算haversine距离的函数
def haversine_distance(lat1, lon1, lat2, lon2):R = 6371 # Earth radius in kilometersdlat = np.radians(lat2 - lat1)dlon = np.radians(lon2 - lon1)a = np.sin(dlat/2) * np.sin(dlat/2) + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon/2) * np.sin(dlon/2)c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))return R * c
2.2 读取文件
所有对应的操作都在单条轨迹处理中已经说明
import pandas as pd
import numpy as np

# Bounding box for Beijing as (lat, lon) corners.
# NOTE(review): B1/B2 were referenced but never defined in the original
# post (the loop would raise NameError) — these are approximate city
# bounds; TODO confirm against the author's actual values.
B1 = (39.4, 115.7)  # south-west corner
B2 = (41.1, 117.4)  # north-east corner

# Per-file cleaned frames; concatenated once at the end instead of calling
# pd.concat inside the loop (which is quadratic in total rows).
frames = []

for num, file in enumerate(file_dirs):
    # .plt files have 6 header lines, then:
    # latitude, longitude, 0, altitude, days, date, time
    data = pd.read_csv(
        file, header=None, skiprows=6,
        names=['Latitude', 'Longitude', 'Not_Important1', 'Altitude',
               'Not_Important2', 'Date', 'Time'])

    # Merge date and time into a single timestamp column.
    data['Datetime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'])
    data = data[['Latitude', 'Longitude', 'Altitude', 'Datetime']]

    # Retain only positions inside the Beijing bounding box.
    data = data[(data['Latitude'] > B1[0]) & (data['Latitude'] < B2[0])
                & (data['Longitude'] > B1[1]) & (data['Longitude'] < B2[1])]

    # Downsample to a 5 s grid: keep the first record of each 5-second bin.
    data['Datetime_5s'] = data['Datetime'].dt.floor('5s')
    data = data.drop_duplicates(subset=['Datetime_5s'], keep='first')

    # Drop consecutive duplicate positions (the device was not moving).
    data['is_moving'] = ((data['Latitude'] != data['Latitude'].shift())
                         | (data['Longitude'] != data['Longitude'].shift()))
    data = data[data['is_moving']]
    data = data[['Latitude', 'Longitude', 'Datetime_5s']]

    # Split a trajectory wherever there is no record for more than 10 min.
    data['time_diff'] = data['Datetime_5s'].diff()
    data['split_id'] = 0
    gap = data['time_diff'] > pd.Timedelta(minutes=10)
    data.loc[gap, 'split_id'] = 1
    data['split_id'] = data['split_id'].cumsum()
    data['id'] = str(num) + '_' + data['split_id'].astype(str)

    # Distance (km) between consecutive points, vectorized.
    # BUG FIX: the original computed haversine(dlat, dlon, 0, 0) on the
    # coordinate *differences*, which drops the cos(latitude) term and
    # overstates east-west distances (~30% at Beijing's latitude).
    data['distance'] = haversine_distance(
        data['Latitude'].shift(), data['Longitude'].shift(),
        data['Latitude'], data['Longitude'])
    # The first point of each sub-trajectory has no predecessor.
    data.loc[data['id'] != data['id'].shift(), 'distance'] = np.nan
    data['accum_dis'] = data.groupby('id')['distance'].cumsum()

    # Truncate long trajectories: start a new sub-trajectory every 10 km.
    data['split_traj_id'] = data['accum_dis'] // 10
    data['split_traj_id'] = data['split_traj_id'].fillna(0)
    data['split_traj_id'] = data['split_traj_id'].astype(int).astype(str)
    data['id'] = data['id'] + '_' + data['split_traj_id']

    # Remove trajectories shorter than 1 km.
    iid = data.groupby('id')['accum_dis'].max().reset_index(name='distance')
    iid = iid[iid['distance'] > 1]
    data = data[data['id'].isin(iid['id'])]

    # Remove trajectories with fewer than 10 records.
    iid = data.groupby('id').size().reset_index(name='count')
    iid = iid[iid['count'] >= 10]
    data = data[data['id'].isin(iid['id'])]

    # Remove "stay point" trajectories: bounding-box diagonal under 1 km.
    latlon = pd.DataFrame()
    latlon['max_lat'] = data.groupby('id')['Latitude'].max()
    latlon['min_lat'] = data.groupby('id')['Latitude'].min()
    latlon['max_lon'] = data.groupby('id')['Longitude'].max()
    latlon['min_lon'] = data.groupby('id')['Longitude'].min()
    latlon['max_dis'] = latlon.apply(
        lambda row: haversine_distance(row['max_lat'], row['max_lon'],
                                       row['min_lat'], row['min_lon']),
        axis=1)
    latlon = latlon[latlon['max_dis'] >= 1]
    data = data[data['id'].isin(latlon.index)]

    frames.append(data[['Latitude', 'Longitude', 'Datetime_5s', 'id']])

traj = pd.concat(frames) if frames else pd.DataFrame()
traj
2.3 保存文件
traj.to_csv('geolife_processed.csv')