import datetime
import os

import pandas as pd
import pymysql

"""
1. Read the Excel/CSV files; the import format must be detected as Excel, CSV or txt.
2.
"""


def get_dates(files):
    # Read every .xlsx/.csv file under the global file_path and return a list
    # of de-duplicated DataFrames, one per file.
    file_list = []
    print('---{} Start reading the files in the folder---'.format(datetime.datetime.now()))
    for name in files:
        ext = os.path.splitext(name)[-1][1:]
        if ext == 'xlsx':
            df = pd.read_excel(os.path.join(file_path, name))
        elif ext == 'csv':
            df = pd.read_csv(os.path.join(file_path, name))
        else:
            continue  # skip unsupported formats
        df.drop_duplicates(inplace=True)
        print('---{} {} contains {} rows---'.format(datetime.datetime.now(), name, len(df)))
        file_list.append(df)
    return file_list


def get_drop_dates():
    # A database-connection class could be called directly here instead.
    conn = pymysql.connect(
        host='cori0108.top',
        user='cori',              # database account
        password='corition0108',  # database password
        db='ln_database',         # database name
        port=3306
    )
    df_drop = pd.read_sql('select OPENID from `官微推文-基准去除数据`', conn)
    conn.close()
    return df_drop

# Alternate connection settings (commented out):
# conn = pymysql.connect(host='nasofcori.fun', user='root', password='123456',
#                        db='cori_database', port=3306)
# conn = pymysql.connect(host='localhost', user='root', password='corition0108',
#                        db='database', port=3306)
# df_drop = pd.read_sql('select OPENID from `常规推文数据`', conn)


# De-duplicate the merged data and remove the baseline list
def duplicates_dates(file_list, df_drop):
    print('---{} Start merging {} groups of posts---'.format(datetime.datetime.now(), len(files)))
    df_all = pd.concat(file_list).drop_duplicates()  # merge and de-duplicate
    print('---{} Merge done! {} groups of posts de-duplicate to {} rows---'.format(
        datetime.datetime.now(), len(files), len(df_all)))
    # Keep only the rows whose OPENID is not in the baseline removal data
    df_all_filter = df_all[~df_all['OPENID'].isin(df_drop['OPENID'])]
    print('---{} {} rows remain after removing the December follower list---'.format(
        datetime.datetime.now(), len(df_all_filter)))
    return df_all_filter


# Merge the restricted list
def duplicates_drop_dates(file_list, df_drop, limit):
    print('---{} Start merging the regular-exclusion posts---'.format(datetime.datetime.now()))
    # Concatenate a fresh list so the caller's file_list is not mutated
    df_all = pd.concat(file_list + [df_drop]).drop_duplicates()
    print('---{} Merge done! The regular-exclusion posts de-duplicate to {} rows---'.format(
        datetime.datetime.now(), len(df_all)))
    df_all.columns = ['用户识别码']
    if len(df_all) > limit:
        n = (len(df_all) + limit - 1) // limit  # number of chunks, rounded up
        print(n)
        for i in range(n):
            # Slices start at i * limit; the original `i * limit + 1` skipped row 0
            df_save = df_all.iloc[i * limit:(i + 1) * limit, :]
            new_file_name = save_path + '/常规刨除' + str(i) + '_new.xlsx'
            print('---{} Exporting file, {} rows after de-duplication'.format(datetime.datetime.now(), len(df_save)))
            df_save.to_excel(new_file_name, index=False)
            print('---{} Exported {} successfully---'.format(datetime.datetime.now(), new_file_name))
    else:
        new_file_name = save_path + '/常规刨除_new.xlsx'
        print('---{} Exporting file, {} rows after de-duplication'.format(datetime.datetime.now(), len(df_all)))
        df_all.to_excel(new_file_name, index=False)
        print('---{} Exported {} successfully---'.format(datetime.datetime.now(), new_file_name))


def eachgroup_dates(file_list, df_all_filter):
    # Split the filtered pool back into per-file groups so that each OPENID
    # is assigned to exactly one group.
    new_file_list = []
    for i in range(len(file_list)):
        print('---{} Start per-group de-duplication---'.format(datetime.datetime.now()))
        if i == 0:
            # Peel off the first group
            df_new = pd.merge(file_list[i], df_all_filter, on='OPENID')
        elif i < len(file_list) - 1:
            # The original test `i > 0 & i < len(files) - 1` was wrong: `&` binds
            # tighter than `<`, so this branch also swallowed the last group.
            df_all_filter = df_all_filter[~df_all_filter['OPENID'].isin(file_list[i - 1]['OPENID'])]
            df_new = pd.merge(file_list[i], df_all_filter, on='OPENID')
        else:
            # The last group takes whatever remains
            df_new = df_all_filter[~df_all_filter['OPENID'].isin(file_list[i - 1]['OPENID'])].copy()
        df_new.rename(columns={'OPENID': '用户识别码'}, inplace=True)
        df_new['备注(选填)'] = datetime.date.today()
        new_file_list.append(df_new)
    return new_file_list
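# duplicates_drop_dates above and export_dates below both slice a DataFrame into
# `limit`-row chunks with manual index arithmetic. A generator expresses the same
# idea more directly; this is an illustrative sketch (the name iter_chunks is not
# part of the original script and nothing here calls it):
def iter_chunks(df, limit):
    """Yield consecutive slices of df, each at most limit rows long."""
    for start in range(0, len(df), limit):
        yield df.iloc[start:start + limit]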
def export_dates(new_file_list, save_path, files, limit):
    for j, df in enumerate(new_file_list):
        stem = os.path.splitext(files[j])[0]
        if len(df) > limit:
            n = (len(df) + limit - 1) // limit  # number of chunks, rounded up
            print(n)
            for i in range(n):
                # The original looped to n + 1 and started each slice at
                # i * limit + 1, dropping the first row and emitting empty chunks
                df_save = df.iloc[i * limit:(i + 1) * limit, :]
                new_file_name = save_path + '/' + stem + str(i) + '_new.xlsx'
                print('---{} Exporting file, {} rows after de-duplication'.format(datetime.datetime.now(), len(df_save)))
                df_save.to_excel(new_file_name, index=False)
                print('---{} Exported {} successfully---'.format(datetime.datetime.now(), new_file_name))
        else:
            new_file_name = save_path + '/' + stem + '_new.xlsx'
            print('---{} Exporting file, {} rows after de-duplication'.format(datetime.datetime.now(), len(df)))
            df.to_excel(new_file_name, index=False)
            print('---{} Exported {} successfully---'.format(datetime.datetime.now(), new_file_name))


file_path = r'D:\常用文件\1-数据分组待处理'
save_path = r'D:\常用文件\2-数据分组已处理'
# Filter the listing up front so the indices of files and file_list stay aligned
files = [f for f in os.listdir(file_path) if os.path.splitext(f)[-1][1:] in ('xlsx', 'csv')]
limit = 300000

if __name__ == '__main__':
    file_list = get_dates(files)
    df_drop = get_drop_dates()
    df_all_filter = duplicates_dates(file_list, df_drop)
    new_file_list = eachgroup_dates(file_list, df_all_filter)
    export_dates(new_file_list, save_path, files, limit)
    duplicates_drop_dates(file_list, df_drop, limit)
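# Newer pandas versions emit a UserWarning when read_sql is handed a raw DBAPI
# connection such as pymysql's. A SQLAlchemy engine avoids that; the function
# below sketches the same query as get_drop_dates under the assumption that
# sqlalchemy is installed, and nothing in this script calls it.
def get_drop_dates_sqlalchemy():
    from sqlalchemy import create_engine  # assumed dependency, not used elsewhere
    engine = create_engine('mysql+pymysql://cori:corition0108@cori0108.top:3306/ln_database')
    return pd.read_sql('select OPENID from `官微推文-基准去除数据`', engine)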