# html_and_vue/分组数据处理.py

import pandas as pd
import os
import datetime
import time
import pymysql
"""
1.读取excel/csv文件需判定导入数据格式为excel或csv或txt
2.
"""
def get_dates(files):
    start_time = time.time()
    file_count = range(len(files))
    file_zip = zip(file_count, files)
    names = locals()
    file_list = []
    print('--- {} Start reading files from the input folder ---'.format(datetime.datetime.now()))
    for i, j in file_zip:
        ext = os.path.splitext(j)[-1][1:]
        if ext == 'xlsx':
            names['df' + str(i)] = pd.read_excel(file_path + '/' + j)
        elif ext == 'csv':
            names['df' + str(i)] = pd.read_csv(file_path + '/' + j)
        else:
            continue  # skip unsupported file types instead of failing on the append below
        names['df' + str(i)].drop_duplicates(inplace=True)
        print('--- {} {}: {} rows in total ---'.format(datetime.datetime.now(), j, len(names['df' + str(i)])))
        file_list.append(names['df' + str(i)])
    return file_list
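# The module docstring mentions .txt input as well, but get_dates() above only
# handles .xlsx and .csv. A minimal extra branch for tab-separated .txt files
# could look like the sketch below (the delimiter is an assumption, and the
# branch is not part of the original flow):
#
#     elif ext == 'txt':
#         names['df' + str(i)] = pd.read_csv(file_path + '/' + j, sep='\t')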
def get_drop_dates():  # a database-connection helper class could be used here instead
    conn = pymysql.connect(
        host='cori0108.top',
        user='cori',              # database user
        password='corition0108',  # database password
        db='ln_database',         # database name
        port=3306
    )
    df_drop = pd.read_sql('select OPENID from `官微推文-基准去除数据`', conn)
    return df_drop
# def get_drop_dates():  # a database-connection helper class could be used here instead
#     conn = pymysql.connect(
#         host='nasofcori.fun',
#         user='root',            # database user
#         password='123456',      # database password
#         db='cori_database',     # database name
#         port=3306
#     )
#     df_drop = pd.read_sql('select OPENID from `官微推文-基准去除数据`', conn)
#     return df_drop
# def get_drop_dates():  # a database-connection helper class could be used here instead
#     conn = pymysql.connect(
#         host='localhost',
#         user='root',            # database user
#         password='corition0108',  # database password
#         db='database',          # database name
#         port=3306
#     )
#     df_drop = pd.read_sql('select OPENID from `常规推文数据`', conn)
#     return df_drop
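# A minimal sketch of the same lookup with an explicitly closed connection
# (assuming the same host, credentials, and table as get_drop_dates() above);
# it is not wired into the flow below:
def get_drop_dates_closing():
    conn = pymysql.connect(host='cori0108.top', user='cori',
                           password='corition0108', db='ln_database', port=3306)
    try:
        return pd.read_sql('select OPENID from `官微推文-基准去除数据`', conn)
    finally:
        conn.close()  # release the connection even if the query fails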
# De-duplicate the data
def duplicates_dates(file_list, df_drop):
    print('--- {} Start merging {} groups of push data ---'.format(datetime.datetime.now(), len(files)))
    df_all = pd.concat(file_list).drop_duplicates()  # merge and de-duplicate
    print('--- {} Merge done: {} groups merged, {} rows after de-duplication ---'.format(datetime.datetime.now(), len(files), len(df_all)))
    df_all_filter = df_all[~df_all['OPENID'].isin(df_drop['OPENID'])]  # drop rows listed in the baseline exclusion table
    print('--- {} {} rows remain after removing the December follower list ---'.format(datetime.datetime.now(), len(df_all_filter)))
    return df_all_filter
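# The `~df_all['OPENID'].isin(df_drop['OPENID'])` mask above is effectively an
# anti-join: it keeps only the OPENIDs that do NOT appear in the exclusion table.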
# Merge the exclusion (restricted) list
def duplicates_drop_dates(file_list, df_drop, limit):
    file_list.append(df_drop)
    print('--- {} Start merging the regular exclusion groups ---'.format(datetime.datetime.now()))
    df_all = pd.concat(file_list).drop_duplicates()  # merge and de-duplicate
    print('--- {} Merge done: regular exclusion data has {} rows after de-duplication ---'.format(datetime.datetime.now(), len(df_all)))
    df_all.columns = ['用户识别码']
    if len(df_all) > limit:
        n = len(df_all) // limit + 1
        print(n)
        for i in range(0, n):
            df_save = df_all.iloc[i * limit:(i + 1) * limit, :]  # slice from i*limit so the first row is not skipped
            new_file_name = save_path + '/常规刨除' + str(i) + '_new.xlsx'
            print('--- {} Exporting file, {} rows after de-duplication ---'.format(datetime.datetime.now(), len(df_save)))
            df_save.to_excel(new_file_name, index=False)
            print('--- {} Exported {} ---'.format(datetime.datetime.now(), new_file_name))
    else:
        new_file_name = save_path + '/' + '常规刨除_new.xlsx'
        print('--- {} Exporting file, {} rows after de-duplication ---'.format(datetime.datetime.now(), len(df_all)))
        df_all.to_excel(new_file_name, index=False)
        print('--- {} Exported {} ---'.format(datetime.datetime.now(), new_file_name))
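# duplicates_drop_dates() above and export_dates() below repeat the same
# "split into at most `limit` rows per output file" logic. A small shared
# helper could express it once (a sketch only; the functions here keep their
# original inline loops):
def iter_chunks(df, limit):
    """Yield consecutive slices of df, each with at most `limit` rows."""
    for start in range(0, len(df), limit):
        yield df.iloc[start:start + limit]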
def eachgroup_dates(file_list, df_all_filter, save_path):
    names = locals()
    j = 0
    for dfs in file_list:
        names['df' + str(j)] = dfs
        j += 1
    new_file_list = []
    i = 0
    for file in files:
        print('--- {} Start per-group de-duplication ---'.format(datetime.datetime.now()))
        if i == 0:  # peel off the first group
            df_new = pd.merge(names['df' + str(i)], df_all_filter, on='OPENID')
        elif 0 < i < len(files) - 1:  # was `i > 0 & i < len(files) - 1`, which reduced to just `i > 0`
            df_all_filter = df_all_filter[
                ~df_all_filter['OPENID'].isin(names['df' + str(i - 1)]['OPENID'])]
            df_new = pd.merge(names['df' + str(i)], df_all_filter, on='OPENID')
        else:  # last group: whatever remains after removing the previous group
            df_new = df_all_filter[~df_all_filter['OPENID'].isin(names['df' + str(i - 1)]['OPENID'])].copy()
        print(df_new)
        df_new.rename(columns={'OPENID': '用户识别码'}, inplace=True)
        # df_new.columns = ['用户识别码']
        df_new['备注(选填)'] = datetime.date.today()
        new_file_list.append(df_new)
        i += 1
    return new_file_list
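# How the peeling in eachgroup_dates() works, on a tiny illustrative example:
# with input files A, B, C and df_all_filter already stripped of the baseline
# exclusion list,
#   group 0 = rows of A that are still in df_all_filter
#   group 1 = rows of B that are in df_all_filter after removing A's OPENIDs
#   group 2 = whatever is left of df_all_filter after also removing B's OPENIDs
# so every OPENID ends up in at most one output group.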
def export_dates(new_file_list, save_path, files, limit):
    j = 0
    for file in new_file_list:
        filename = files[j].split('.')
        if len(file) > limit:
            n = len(file) // limit + 1
            print(n)
            for i in range(0, n):  # was range(0, n + 1), which produced an extra empty chunk
                df_save = file.iloc[i * limit:(i + 1) * limit, :]  # slice from i*limit so the first row is not skipped
                new_file_name = save_path + '/' + filename[0] + str(i) + '_new.xlsx'
                print('--- {} Exporting file, {} rows after de-duplication ---'.format(datetime.datetime.now(), len(df_save)))
                df_save.to_excel(new_file_name, index=False)
                print('--- {} Exported {} ---'.format(datetime.datetime.now(), new_file_name))
        else:
            new_file_name = save_path + '/' + filename[0] + '_new.xlsx'
            print('--- {} Exporting file, {} rows after de-duplication ---'.format(datetime.datetime.now(), len(file)))
            file.to_excel(new_file_name, index=False)
            print('--- {} Exported {} ---'.format(datetime.datetime.now(), new_file_name))
        j += 1
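# Module-level driver: read every file in file_path, fetch the exclusion list
# from MySQL, filter and split the merged data back into per-file groups, then
# export each group (and the merged exclusion list) to save_path in chunks of
# at most `limit` rows.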
file_path = r'D:\常用文件\1-数据分组待处理'
save_path = r'D:\常用文件\2-数据分组已处理'
files = os.listdir(file_path)
limit = 300000
file_list = get_dates(files)
df_drop = get_drop_dates()
df_all_filter = duplicates_dates(file_list, df_drop)
new_file_list = eachgroup_dates(file_list, df_all_filter, save_path)
export_dates(new_file_list, save_path, files, limit)
duplicates_drop_dates(file_list, df_drop, limit)