import pandas as pd
import os
import datetime
import time

import pymysql

"""
1. Read the Excel/CSV files; the import format has to be detected as xlsx, csv, or txt.
2. Merge and deduplicate the groups, remove the baseline list pulled from MySQL,
   then export each group in chunks of at most `limit` rows.
"""

def get_dates(files):
    start_time = time.time()
    file_list = []
    print('---{} Start reading files in the folder---'.format(datetime.datetime.now()))
    for name in files:
        ext = os.path.splitext(name)[-1][1:]
        if ext == 'xlsx':
            df = pd.read_excel(file_path + '/' + name)
        elif ext == 'csv':
            df = pd.read_csv(file_path + '/' + name)
        else:
            # unsupported format (txt is not implemented yet): skip instead of crashing
            print('---{} {} skipped: unsupported format---'.format(datetime.datetime.now(), name))
            continue
        df.drop_duplicates(inplace=True)
        print('---{} {}: {} rows in total---'.format(datetime.datetime.now(), name, len(df)))
        file_list.append(df)
    print('---{} Reading finished in {:.1f}s---'.format(datetime.datetime.now(), time.time() - start_time))
    return file_list

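# The module docstring also mentions txt input; a minimal sketch of the extra
# branch for get_dates, assuming tab-separated text (the separator is an
# assumption, adjust as needed):
#
#         elif ext == 'txt':
#             df = pd.read_csv(file_path + '/' + name, sep='\t')  # hypothetical txt branch
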
def get_drop_dates():  # a shared database-connection class could be called here instead
    conn = pymysql.connect(
        host='cori0108.top',
        user='cori',              # database account
        password='corition0108',  # database password
        db='ln_database',         # database name
        port=3306
    )
    df_drop = pd.read_sql('select OPENID from `官微推文-基准去除数据`', conn)
    conn.close()  # release the connection once the baseline list is in memory
    return df_drop

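# pandas.read_sql officially supports SQLAlchemy connectables (plus sqlite3);
# passing a raw pymysql connection as above works, but newer pandas versions
# emit a UserWarning about it. A minimal sketch of the SQLAlchemy variant,
# assuming sqlalchemy is installed (same credentials as above):
#
# from sqlalchemy import create_engine
# engine = create_engine('mysql+pymysql://cori:corition0108@cori0108.top:3306/ln_database')
# df_drop = pd.read_sql('select OPENID from `官微推文-基准去除数据`', engine)
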
# def get_drop_dates():  # alternate connection config, kept for reference
#     conn = pymysql.connect(
#         host='nasofcori.fun',
#         user='root',            # database account
#         password='123456',      # database password
#         db='cori_database',     # database name
#         port=3306
#     )
#     df_drop = pd.read_sql('select OPENID from `官微推文-基准去除数据`', conn)
#     return df_drop


# def get_drop_dates():  # alternate connection config, kept for reference
#     conn = pymysql.connect(
#         host='localhost',
#         user='root',              # database account
#         password='corition0108',  # database password
#         db='database',            # database name
#         port=3306
#     )
#     df_drop = pd.read_sql('select OPENID from `常规推文数据`', conn)
#     return df_drop

# Data deduplication
def duplicates_dates(file_list, df_drop):
    print('---{} Start merging {} groups of tweet data---'.format(datetime.datetime.now(), len(files)))
    df_all = pd.concat(file_list).drop_duplicates()  # merge and deduplicate
    print('---{} Merge done! {} groups deduplicated to {} rows in total---'.format(datetime.datetime.now(), len(files), len(df_all)))
    df_all_filter = df_all[~df_all['OPENID'].isin(df_drop['OPENID'])]  # drop rows on the baseline removal list
    print('---{} {} rows remain after removing the December follower list---'.format(datetime.datetime.now(), len(df_all_filter)))
    return df_all_filter

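# Toy illustration of the isin() anti-join in duplicates_dates (hypothetical values):
#   df_all['OPENID'] = ['a', 'b', 'c'],  df_drop['OPENID'] = ['b']
#   df_all[~df_all['OPENID'].isin(df_drop['OPENID'])]  keeps rows 'a' and 'c'
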
# Merge the restricted list
def duplicates_drop_dates(file_list, df_drop, limit):
    file_list = file_list + [df_drop]  # build a new list so the caller's file_list is untouched
    print('---{} Start merging the regular-exclusion tweet groups---'.format(datetime.datetime.now()))
    df_all = pd.concat(file_list).drop_duplicates()  # merge and deduplicate
    print('---{} Merge done! Regular-exclusion data deduplicated to {} rows in total---'.format(datetime.datetime.now(), len(df_all)))

    df_all.columns = ['用户识别码']
    if len(df_all) > limit:
        n = (len(df_all) + limit - 1) // limit  # number of chunks, rounded up
        print(n)
        for i in range(n):
            # half-open slices starting at i * limit, so row 0 is included and no row is skipped
            df_save = df_all.iloc[i * limit:(i + 1) * limit, :]
            new_file_name = save_path + '/常规刨除' + str(i) + '_new.xlsx'
            print('---{} Start exporting, {} deduplicated rows in total---'.format(datetime.datetime.now(), len(df_save)))
            df_save.to_excel(new_file_name, index=False)
            print('---{} Successfully exported {}---'.format(datetime.datetime.now(), new_file_name))
    else:
        new_file_name = save_path + '/' + '常规刨除_new.xlsx'
        print('---{} Start exporting, {} deduplicated rows in total---'.format(datetime.datetime.now(), len(df_all)))
        df_all.to_excel(new_file_name, index=False)
        print('---{} Successfully exported {}---'.format(datetime.datetime.now(), new_file_name))

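# Worked example of the chunking in duplicates_drop_dates (hypothetical numbers):
# len(df_all) == 650000, limit == 300000  ->  n = (650000 + 299999) // 300000 == 3,
# giving slices [0:300000], [300000:600000], [600000:900000], i.e.
# 300000 + 300000 + 50000 rows, with no empty trailing file.
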
def eachgroup_dates(file_list, df_all_filter, save_path):
    new_file_list = []
    for i in range(len(file_list)):
        print('---{} Start per-group deduplication---'.format(datetime.datetime.now()))
        if i == 0:  # peel off the first group
            df_new = pd.merge(file_list[i], df_all_filter, on='OPENID')
        elif i < len(file_list) - 1:  # middle groups: remove everything already claimed
            df_all_filter = df_all_filter[
                ~df_all_filter['OPENID'].isin(file_list[i - 1]['OPENID'])]
            df_new = pd.merge(file_list[i], df_all_filter, on='OPENID')
        else:  # last group: take whatever remains after removing the previous group
            df_new = df_all_filter[~df_all_filter['OPENID'].isin(file_list[i - 1]['OPENID'])].copy()

        df_new = df_new.rename(columns={'OPENID': '用户识别码'})
        df_new['备注(选填)'] = datetime.date.today()
        new_file_list.append(df_new)

    return new_file_list

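# How the cascade in eachgroup_dates distributes rows (hypothetical groups A, B, C):
# group 0 exports A ∩ filtered, group 1 exports B ∩ (filtered − A), and the last
# group exports everything still left in (filtered − A − B), so no OPENID appears
# in two exports and no filtered row is lost.
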
def export_dates(new_file_list, save_path, files, limit):
    for j, file in enumerate(new_file_list):
        filename = files[j].split('.')
        if len(file) > limit:
            n = (len(file) + limit - 1) // limit  # number of chunks, rounded up
            print(n)
            for i in range(n):
                # half-open slices starting at i * limit, so row 0 is included and no row is skipped
                df_save = file.iloc[i * limit:(i + 1) * limit, :]
                new_file_name = save_path + '/' + filename[0] + str(i) + '_new.xlsx'
                print('---{} Start exporting, {} deduplicated rows in total---'.format(datetime.datetime.now(), len(df_save)))
                df_save.to_excel(new_file_name, index=False)
                print('---{} Successfully exported {}---'.format(datetime.datetime.now(), new_file_name))
        else:
            new_file_name = save_path + '/' + filename[0] + '_new.xlsx'
            print('---{} Start exporting, {} deduplicated rows in total---'.format(datetime.datetime.now(), len(file)))
            file.to_excel(new_file_name, index=False)
            print('---{} Successfully exported {}---'.format(datetime.datetime.now(), new_file_name))

file_path = r'D:\常用文件\1-数据分组待处理'
save_path = r'D:\常用文件\2-数据分组已处理'
# keep only the formats get_dates can read, so `files` stays aligned with `file_list`
files = [f for f in os.listdir(file_path) if os.path.splitext(f)[-1][1:] in ('xlsx', 'csv')]
limit = 300000

file_list = get_dates(files)
df_drop = get_drop_dates()
df_all_filter = duplicates_dates(file_list, df_drop)
new_file_list = eachgroup_dates(file_list, df_all_filter, save_path)
export_dates(new_file_list, save_path, files, limit)
duplicates_drop_dates(file_list, df_drop, limit)
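# Note: because the calls above sit at module level, importing this file runs the
# whole pipeline. Wrapping them in the standard `if __name__ == '__main__':`
# guard would make the functions importable without side effects.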