import pandas as pd
import os
import datetime
import time
import pymysql

"""
1. Read the Excel/CSV files; detect whether each input file is xlsx, csv, or txt.
2. Merge and de-duplicate the data, remove the baseline exclusion list, split the
   remainder back out per source group, and export the results.
"""


def get_dates(files):
    start_time = time.time()
    file_list = []
    print('---{} start reading data from the folder---'.format(datetime.datetime.now()))
    for name in files:
        ext = os.path.splitext(name)[-1][1:]
        if ext == 'xlsx':
            df = pd.read_excel(file_path + '/' + name)
        elif ext == 'csv':
            df = pd.read_csv(file_path + '/' + name)
        else:
            # Later steps pair file_list with `files` one-to-one, so an
            # unsupported format is rejected instead of silently skipped.
            raise ValueError('unsupported file format: {}'.format(name))
        df.drop_duplicates(inplace=True)
        print('---{} {} holds {} rows---'.format(datetime.datetime.now(), name, len(df)))
        file_list.append(df)
    print('---{} reading finished in {:.1f}s---'.format(datetime.datetime.now(), time.time() - start_time))
    return file_list
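
# The docstring calls for txt input as well, but get_dates() currently reads
# only xlsx and csv. A minimal sketch of the extra branch, assuming
# tab-separated text with a header row (the separator is an assumption):
#
#     elif ext == 'txt':
#         df = pd.read_csv(file_path + '/' + name, sep='\t')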


def get_drop_dates():  # a shared database-connection class could be called here instead
    conn = pymysql.connect(
        host='cori0108.top',
        user='cori',              # database account
        password='corition0108',  # database password
        db='ln_database',         # database name
        port=3306
    )
    try:
        df_drop = pd.read_sql('select OPENID from `官微推文-基准去除数据`', conn)
    finally:
        conn.close()  # release the connection even if the query fails
    return df_drop
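
# pd.read_sql() above is handed a raw pymysql connection; recent pandas
# releases warn about DBAPI connections other than sqlite3 and recommend a
# SQLAlchemy engine. A minimal sketch, assuming the same credentials
# (sqlalchemy would be an extra dependency):
#
#     from sqlalchemy import create_engine
#     engine = create_engine('mysql+pymysql://cori:corition0108@cori0108.top:3306/ln_database')
#     df_drop = pd.read_sql('select OPENID from `官微推文-基准去除数据`', engine)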

# Alternate connection settings kept for other environments:

# def get_drop_dates():  # a shared database-connection class could be called here instead
#     conn = pymysql.connect(
#         host='nasofcori.fun',
#         user='root',            # database account
#         password='123456',      # database password
#         db='cori_database',     # database name
#         port=3306
#     )
#     df_drop = pd.read_sql('select OPENID from `官微推文-基准去除数据`', conn)
#     return df_drop


# def get_drop_dates():  # a shared database-connection class could be called here instead
#     conn = pymysql.connect(
#         host='localhost',
#         user='root',              # database account
#         password='corition0108',  # database password
#         db='database',            # database name
#         port=3306
#     )
#     df_drop = pd.read_sql('select OPENID from `常规推文数据`', conn)
#     return df_drop


# De-duplicate the data.
def duplicates_dates(file_list, df_drop):
    print('---{} start merging {} groups of tweet data---'.format(datetime.datetime.now(), len(files)))
    df_all = pd.concat(file_list).drop_duplicates()  # merge and de-duplicate
    print('---{} merge done! {} groups de-duplicated to {} rows---'.format(datetime.datetime.now(), len(files), len(df_all)))
    df_all_filter = df_all[~df_all['OPENID'].isin(df_drop['OPENID'])]  # rows left after removing the baseline data
    print('---{} {} rows remain after removing the December follower list---'.format(datetime.datetime.now(), len(df_all_filter)))
    return df_all_filter
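
# Filtering example: if df_all['OPENID'] holds ['a', 'b', 'c'] and
# df_drop['OPENID'] holds ['b'], the ~isin() mask keeps rows 'a' and 'c'.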


# Merge the restricted list.
def duplicates_drop_dates(file_list, df_drop, limit):
    file_list.append(df_drop)  # mutates the caller's list, so call this function last
    print('---{} start merging the regular-exclusion tweet data---'.format(datetime.datetime.now()))
    df_all = pd.concat(file_list).drop_duplicates()  # merge and de-duplicate
    print('---{} merge done! regular-exclusion data de-duplicated to {} rows---'.format(datetime.datetime.now(), len(df_all)))

    df_all.columns = ['用户识别码']  # assumes every frame carries the single OPENID column
    if len(df_all) > limit:
        n = (len(df_all) + limit - 1) // limit  # number of chunks, rounded up
        print(n)
        for i in range(n):
            df_save = df_all.iloc[i * limit:(i + 1) * limit, :]
            new_file_name = save_path + '/常规刨除' + str(i) + '_new.xlsx'
            print('---{} exporting file, {} rows after de-duplication'.format(datetime.datetime.now(), len(df_save)))
            df_save.to_excel(new_file_name, index=False)
            print('---{} exported {} successfully---'.format(datetime.datetime.now(), new_file_name))
    else:
        new_file_name = save_path + '/常规刨除_new.xlsx'
        print('---{} exporting file, {} rows after de-duplication'.format(datetime.datetime.now(), len(df_all)))
        df_all.to_excel(new_file_name, index=False)
        print('---{} exported {} successfully---'.format(datetime.datetime.now(), new_file_name))


def eachgroup_dates(file_list, df_all_filter):
    new_file_list = []
    for i in range(len(file_list)):
        print('---{} start per-group de-duplication---'.format(datetime.datetime.now()))
        if i == 0:  # peel off the first group
            df_new = pd.merge(file_list[i], df_all_filter, on='OPENID')
        elif 0 < i < len(file_list) - 1:
            # drop everything already claimed by the previous group
            df_all_filter = df_all_filter[
                ~df_all_filter['OPENID'].isin(file_list[i - 1]['OPENID'])]
            df_new = pd.merge(file_list[i], df_all_filter, on='OPENID')
        else:  # the last group takes whatever is left
            df_new = df_all_filter[~df_all_filter['OPENID'].isin(file_list[i - 1]['OPENID'])]

        print(df_new)
        df_new = df_new.rename(columns={'OPENID': '用户识别码'})
        df_new['备注(选填)'] = datetime.date.today()
        new_file_list.append(df_new)

    return new_file_list
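
# Assignment example with three groups A, B, C: A keeps its surviving rows,
# B keeps its surviving rows minus anything A already took, and C (the last
# group) receives every row still unassigned, so each OPENID lands in exactly
# one output file.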


def export_dates(new_file_list, save_path, files, limit):
    for j, file in enumerate(new_file_list):
        filename = files[j].split('.')
        if len(file) > limit:
            n = (len(file) + limit - 1) // limit  # number of chunks, rounded up
            print(n)
            for i in range(n):
                df_save = file.iloc[i * limit:(i + 1) * limit, :]
                new_file_name = save_path + '/' + filename[0] + str(i) + '_new.xlsx'
                print('---{} exporting file, {} rows after de-duplication'.format(datetime.datetime.now(), len(df_save)))
                df_save.to_excel(new_file_name, index=False)
                print('---{} exported {} successfully---'.format(datetime.datetime.now(), new_file_name))
        else:
            new_file_name = save_path + '/' + filename[0] + '_new.xlsx'
            print('---{} exporting file, {} rows after de-duplication'.format(datetime.datetime.now(), len(file)))
            file.to_excel(new_file_name, index=False)
            print('---{} exported {} successfully---'.format(datetime.datetime.now(), new_file_name))
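
# Chunking example: with limit = 300000 and 650000 rows, n = 3 and the slices
# cover rows [0, 300000), [300000, 600000), and [600000, 650000).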


file_path = r'D:\常用文件\1-数据分组待处理'
save_path = r'D:\常用文件\2-数据分组已处理'
files = [f for f in os.listdir(file_path) if f.endswith(('.xlsx', '.csv'))]  # only formats get_dates() can read
limit = 300000

file_list = get_dates(files)
df_drop = get_drop_dates()
df_all_filter = duplicates_dates(file_list, df_drop)
new_file_list = eachgroup_dates(file_list, df_all_filter)
export_dates(new_file_list, save_path, files, limit)
duplicates_drop_dates(file_list, df_drop, limit)  # runs last: it appends df_drop to file_list