# html_and_vue/分组数据处理.py  (group-data processing)
# 164 lines, 6.7 KiB, Python
# Last modified: 2024-04-29 09:09:17 +00:00
import pandas as pd
import os
import datetime
import time
import pymysql
"""
1.读取excel/csv文件需判定导入数据格式为excel或csv或txt
2.
"""
def get_dates(files, path=None):
    """Read each .xlsx/.csv file in *files*, dedupe its rows, return the DataFrames.

    :param files: iterable of file names (not full paths)
    :param path:  directory containing the files; defaults to the
                  module-level ``file_path``
    :return: list of per-file DataFrames, each with duplicate rows removed

    Files with any other extension are skipped.  (The original code fell
    through to ``file_list.append`` with an undefined name for such files
    and raised ``NameError``; it also created an unused ``star_time`` and
    juggled DataFrames through a ``locals()`` dict.)
    """
    base = file_path if path is None else path
    print('---{}开始读取文件夹中数据---'.format(datetime.datetime.now()))
    file_list = []
    for name in files:
        ext = os.path.splitext(name)[-1][1:].lower()
        if ext == 'xlsx':
            df = pd.read_excel(base + '/' + name)
        elif ext == 'csv':
            df = pd.read_csv(base + '/' + name)
        else:
            # unsupported extension: skip instead of crashing
            continue
        df.drop_duplicates(inplace=True)
        print('---{} {}共计{}条数据---'.format(datetime.datetime.now(), name, len(df)))
        file_list.append(df)
    return file_list
def get_drop_dates():  # could be replaced by a shared DB-connection helper class
    """Fetch the baseline exclusion list from MySQL.

    :return: single-column DataFrame of ``OPENID`` values to strip from the
             merged push data
    """
    # NOTE(review): credentials are hard-coded — move to a config file or
    # environment variables.
    conn = pymysql.connect(
        host='cori0108.top',
        user='cori',
        password='corition0108',
        db='ln_database',
        port=3306
    )
    try:
        df_drop = pd.read_sql('select OPENID from `官微推文-基准去除数据`', conn)
    finally:
        conn.close()  # original leaked the connection
    return df_drop
# def get_drop_dates(): # 这里可以直接调用链接数据库的类
# conn = pymysql.connect(
# host = 'nasofcori.fun',
# user = 'root', # 数据库帐号
# password = '123456', # 数据库访问密码
# db = 'cori_database', # 数据库名
# port = 3306
# )
# df_drop = pd.read_sql('select OPENID from `官微推文-基准去除数据`', conn)
# return df_drop
# def get_drop_dates(): # 这里可以直接调用链接数据库的类
# conn = pymysql.connect(
# host = 'localhost',
# user = 'root', # 数据库帐号
# password = 'corition0108', # 数据库访问密码
# db = 'database', # 数据库名
# port = 3306
# )
# df_drop = pd.read_sql('select OPENID from `常规推文数据`', conn)
# return df_drop
# 数据去重 (merge all groups, dedupe, strip baseline OPENIDs)
def duplicates_dates(file_list, df_drop):
    """Concatenate all group DataFrames, dedupe, then drop baseline OPENIDs.

    :param file_list: list of DataFrames, each with an ``OPENID`` column
    :param df_drop:   exclusion DataFrame with an ``OPENID`` column
    :return: DataFrame of the merged rows whose OPENID is not in *df_drop*
    """
    # use len(file_list) rather than the module-level ``files`` global the
    # original depended on — same count, no hidden coupling
    print('---{} 开始合并{}组推文---'.format(datetime.datetime.now(), len(file_list)))
    df_all = pd.concat(file_list).drop_duplicates()  # 合并去重
    print('---{} 合并完成!{}组推文合并去重后合计{}条数据---'.format(datetime.datetime.now(), len(file_list), len(df_all)))
    df_all_filter = df_all[~df_all['OPENID'].isin(df_drop['OPENID'])]  # 获取去除基准数据后的df
    print('---{} 去除12月关注名单后后合计剩余{}条数据---'.format(datetime.datetime.now(), len(df_all_filter)))
    return df_all_filter
# 合并受限名单 (merge the restricted list and export it)
def duplicates_drop_dates(file_list, df_drop, limit, path=None):
    """Merge all group data plus the exclusion list into one deduplicated
    "常规刨除" list and export it to Excel, split into files of at most
    *limit* rows.

    :param file_list: list of single-column OPENID DataFrames
    :param df_drop:   baseline exclusion DataFrame
    :param limit:     maximum rows per exported Excel file
    :param path:      output directory; defaults to module-level ``save_path``
    """
    out_dir = save_path if path is None else path
    print('---{} 开始合并常规刨除组推文---'.format(datetime.datetime.now()))
    # build a new list: the original appended df_drop to the caller's
    # file_list in place, mutating the argument
    df_all = pd.concat(file_list + [df_drop]).drop_duplicates()  # 合并去重
    print('---{} 合并完成!常规刨除推文合并去重后合计{}条数据---'.format(datetime.datetime.now(), len(df_all)))
    df_all.columns = ['用户识别码']
    if len(df_all) > limit:
        # ceil division; the original's ``len // limit + 1`` could produce
        # one extra empty chunk when len is an exact multiple of limit
        n = (len(df_all) + limit - 1) // limit
        print(n)
        for i in range(n):
            # original sliced ``i*limit+1 : (i+1)*limit+1`` and silently
            # dropped row 0 of the output
            df_save = df_all.iloc[i * limit:(i + 1) * limit, :]
            new_file_name = out_dir + '/常规刨除' + str(i) + '_new.xlsx'
            print('---{} 开始导出文件,去重后合计{}条数据'.format(datetime.datetime.now(), len(df_save)))
            df_save.to_excel(new_file_name, index=False)
            print('---{} 已成功导出{}文件--'.format(datetime.datetime.now(), new_file_name))
    else:
        new_file_name = out_dir + '/' + '常规刨除_new.xlsx'
        print('---{} 开始导出文件,去重后合计{}条数据'.format(datetime.datetime.now(), len(df_all)))
        df_all.to_excel(new_file_name, index=False)
        print('---{} 已成功导出{}文件--'.format(datetime.datetime.now(), new_file_name))
def eachgroup_dates(file_list, df_all_filter, save_path):
    """Split the filtered OPENID pool back into disjoint per-group DataFrames.

    Group 0 keeps its own OPENIDs from the pool; each middle group first
    removes everything belonging to the previous group from the pool, then
    keeps its own OPENIDs; the last group receives whatever remains.

    :param file_list:     per-group DataFrames with an ``OPENID`` column
    :param df_all_filter: merged + filtered OPENID pool
    :param save_path:     unused; kept for signature compatibility
    :return: list of DataFrames renamed to ``用户识别码`` with a ``备注(选填)``
             column set to today's date
    """
    new_file_list = []
    last = len(file_list) - 1
    for i, df_group in enumerate(file_list):
        print('---{} 开始进行分组去重---'.format(datetime.datetime.now()))
        if i == 0:  # 剥离第一组数据
            df_new = pd.merge(df_group, df_all_filter, on='OPENID')
        elif i < last:
            # The original wrote ``i > 0 & i < len(files) - 1``: ``&`` binds
            # tighter than comparisons, so it parsed as the chained compare
            # ``i > (0 & i) < len(files) - 1`` and matched EVERY i > 0,
            # making the else (final-group) branch unreachable.
            df_all_filter = df_all_filter[
                ~df_all_filter['OPENID'].isin(file_list[i - 1]['OPENID'])]
            df_new = pd.merge(df_group, df_all_filter, on='OPENID')
        else:
            # last group takes the remainder of the pool; .copy() avoids
            # SettingWithCopy on the assignment below
            df_new = df_all_filter[
                ~df_all_filter['OPENID'].isin(file_list[i - 1]['OPENID'])].copy()
        print(df_new)
        df_new.rename(columns={'OPENID': '用户识别码'}, inplace=True)
        df_new['备注(选填)'] = datetime.date.today()
        new_file_list.append(df_new)
    return new_file_list
def export_dates(new_file_list, save_path, files, limit):
    """Export each group DataFrame to Excel, named after its source file and
    split into chunks of at most *limit* rows.

    :param new_file_list: DataFrames produced by ``eachgroup_dates``
    :param save_path:     output directory
    :param files:         original file names, parallel to *new_file_list*
    :param limit:         maximum rows per exported Excel file
    """
    for j, df_group in enumerate(new_file_list):
        stem = files[j].split('.')[0]
        if len(df_group) > limit:
            n = (len(df_group) + limit - 1) // limit  # ceil division
            print(n)
            # The original iterated ``range(0, n + 1)`` with
            # ``n = len // limit + 1`` (up to two spurious empty chunks) and
            # sliced from ``i*limit + 1``, silently dropping row 0.
            for i in range(n):
                df_save = df_group.iloc[i * limit:(i + 1) * limit, :]
                new_file_name = save_path + '/' + stem + str(i) + '_new.xlsx'
                print('---{} 开始导出文件,去重后合计{}条数据'.format(datetime.datetime.now(), len(df_save)))
                df_save.to_excel(new_file_name, index=False)
                print('---{} 已成功导出{}文件--'.format(datetime.datetime.now(), new_file_name))
        else:
            new_file_name = save_path + '/' + stem + '_new.xlsx'
            print('---{} 开始导出文件,去重后合计{}条数据'.format(datetime.datetime.now(), len(df_group)))
            df_group.to_excel(new_file_name, index=False)
            print('---{} 已成功导出{}文件--'.format(datetime.datetime.now(), new_file_name))
# --- script driver: configure paths, then run the pipeline top to bottom ---
# NOTE(review): several functions above read these globals directly
# (file_path, save_path, files), so they must stay at module level.
file_path = r'D:\常用文件\1-数据分组待处理'  # input folder with raw group files
save_path = r'D:\常用文件\2-数据分组已处理'  # output folder for exports
files = os.listdir(file_path)  # every file in the input folder, OS order
limit = 300000  # max rows per exported Excel file
file_list = get_dates(files)  # read + per-file dedupe
df_drop = get_drop_dates()  # baseline exclusion list from MySQL
df_all_filter = duplicates_dates(file_list, df_drop)  # merge, dedupe, filter
new_file_list = eachgroup_dates(file_list, df_all_filter, save_path)  # regroup
export_dates(new_file_list, save_path, files, limit)  # write per-group files
duplicates_drop_dates(file_list, df_drop, limit)  # write merged exclusion list