import pandas as pd
import os
import datetime
import time
import pymysql

"""
1. Read the Excel/CSV files; detect whether each input file is xlsx, csv, or txt.
2. Merge and de-duplicate the data, remove the baseline exclusion list, split the
   remainder back out per source group, and export the results.
"""


def get_dates(files):
    start_time = time.time()
    file_list = []
    print('---{} start reading data from the folder---'.format(datetime.datetime.now()))
    for name in files:
        ext = os.path.splitext(name)[-1][1:]
        if ext == 'xlsx':
            df = pd.read_excel(file_path + '/' + name)
        elif ext == 'csv':
            df = pd.read_csv(file_path + '/' + name)
        else:
            # Later steps pair file_list with `files` one-to-one, so an
            # unsupported format is rejected instead of silently skipped.
            raise ValueError('unsupported file format: {}'.format(name))
        df.drop_duplicates(inplace=True)
        print('---{} {} holds {} rows---'.format(datetime.datetime.now(), name, len(df)))
        file_list.append(df)
    print('---{} reading finished in {:.1f}s---'.format(datetime.datetime.now(), time.time() - start_time))
    return file_list
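
# The docstring calls for txt input as well, but get_dates() currently reads
# only xlsx and csv. A minimal sketch of the extra branch, assuming
# tab-separated text with a header row (the separator is an assumption):
#
#     elif ext == 'txt':
#         df = pd.read_csv(file_path + '/' + name, sep='\t')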


def get_drop_dates():  # a shared database-connection class could be called here instead
    conn = pymysql.connect(
        host='cori0108.top',
        user='cori',              # database account
        password='corition0108',  # database password
        db='ln_database',         # database name
        port=3306
    )
    try:
        df_drop = pd.read_sql('select OPENID from `官微推文-基准去除数据`', conn)
    finally:
        conn.close()  # release the connection even if the query fails
    return df_drop
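
# pd.read_sql() above is handed a raw pymysql connection; recent pandas
# releases warn about DBAPI connections other than sqlite3 and recommend a
# SQLAlchemy engine. A minimal sketch, assuming the same credentials
# (sqlalchemy would be an extra dependency):
#
#     from sqlalchemy import create_engine
#     engine = create_engine('mysql+pymysql://cori:corition0108@cori0108.top:3306/ln_database')
#     df_drop = pd.read_sql('select OPENID from `官微推文-基准去除数据`', engine)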

# Alternate connection settings kept for other environments:

# def get_drop_dates():  # a shared database-connection class could be called here instead
#     conn = pymysql.connect(
#         host='nasofcori.fun',
#         user='root',            # database account
#         password='123456',      # database password
#         db='cori_database',     # database name
#         port=3306
#     )
#     df_drop = pd.read_sql('select OPENID from `官微推文-基准去除数据`', conn)
#     return df_drop


# def get_drop_dates():  # a shared database-connection class could be called here instead
#     conn = pymysql.connect(
#         host='localhost',
#         user='root',              # database account
#         password='corition0108',  # database password
#         db='database',            # database name
#         port=3306
#     )
#     df_drop = pd.read_sql('select OPENID from `常规推文数据`', conn)
#     return df_drop


# De-duplicate the data.
def duplicates_dates(file_list, df_drop):
    print('---{} start merging {} groups of tweet data---'.format(datetime.datetime.now(), len(files)))
    df_all = pd.concat(file_list).drop_duplicates()  # merge and de-duplicate
    print('---{} merge done! {} groups de-duplicated to {} rows---'.format(datetime.datetime.now(), len(files), len(df_all)))
    df_all_filter = df_all[~df_all['OPENID'].isin(df_drop['OPENID'])]  # rows left after removing the baseline data
    print('---{} {} rows remain after removing the December follower list---'.format(datetime.datetime.now(), len(df_all_filter)))
    return df_all_filter
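
# Filtering example: if df_all['OPENID'] holds ['a', 'b', 'c'] and
# df_drop['OPENID'] holds ['b'], the ~isin() mask keeps rows 'a' and 'c'.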


# Merge the restricted list.
def duplicates_drop_dates(file_list, df_drop, limit):
    file_list.append(df_drop)  # mutates the caller's list, so call this function last
    print('---{} start merging the regular-exclusion tweet data---'.format(datetime.datetime.now()))
    df_all = pd.concat(file_list).drop_duplicates()  # merge and de-duplicate
    print('---{} merge done! regular-exclusion data de-duplicated to {} rows---'.format(datetime.datetime.now(), len(df_all)))

    df_all.columns = ['用户识别码']  # assumes every frame carries the single OPENID column
    if len(df_all) > limit:
        n = (len(df_all) + limit - 1) // limit  # number of chunks, rounded up
        print(n)
        for i in range(n):
            df_save = df_all.iloc[i * limit:(i + 1) * limit, :]
            new_file_name = save_path + '/常规刨除' + str(i) + '_new.xlsx'
            print('---{} exporting file, {} rows after de-duplication'.format(datetime.datetime.now(), len(df_save)))
            df_save.to_excel(new_file_name, index=False)
            print('---{} exported {} successfully---'.format(datetime.datetime.now(), new_file_name))
    else:
        new_file_name = save_path + '/常规刨除_new.xlsx'
        print('---{} exporting file, {} rows after de-duplication'.format(datetime.datetime.now(), len(df_all)))
        df_all.to_excel(new_file_name, index=False)
        print('---{} exported {} successfully---'.format(datetime.datetime.now(), new_file_name))


def eachgroup_dates(file_list, df_all_filter):
    new_file_list = []
    for i in range(len(file_list)):
        print('---{} start per-group de-duplication---'.format(datetime.datetime.now()))
        if i == 0:  # peel off the first group
            df_new = pd.merge(file_list[i], df_all_filter, on='OPENID')
        elif 0 < i < len(file_list) - 1:
            # drop everything already claimed by the previous group
            df_all_filter = df_all_filter[
                ~df_all_filter['OPENID'].isin(file_list[i - 1]['OPENID'])]
            df_new = pd.merge(file_list[i], df_all_filter, on='OPENID')
        else:  # the last group takes whatever is left
            df_new = df_all_filter[~df_all_filter['OPENID'].isin(file_list[i - 1]['OPENID'])]

        print(df_new)
        df_new = df_new.rename(columns={'OPENID': '用户识别码'})
        df_new['备注(选填)'] = datetime.date.today()
        new_file_list.append(df_new)

    return new_file_list
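
# Assignment example with three groups A, B, C: A keeps its surviving rows,
# B keeps its surviving rows minus anything A already took, and C (the last
# group) receives every row still unassigned, so each OPENID lands in exactly
# one output file.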


def export_dates(new_file_list, save_path, files, limit):
    for j, file in enumerate(new_file_list):
        filename = files[j].split('.')
        if len(file) > limit:
            n = (len(file) + limit - 1) // limit  # number of chunks, rounded up
            print(n)
            for i in range(n):
                df_save = file.iloc[i * limit:(i + 1) * limit, :]
                new_file_name = save_path + '/' + filename[0] + str(i) + '_new.xlsx'
                print('---{} exporting file, {} rows after de-duplication'.format(datetime.datetime.now(), len(df_save)))
                df_save.to_excel(new_file_name, index=False)
                print('---{} exported {} successfully---'.format(datetime.datetime.now(), new_file_name))
        else:
            new_file_name = save_path + '/' + filename[0] + '_new.xlsx'
            print('---{} exporting file, {} rows after de-duplication'.format(datetime.datetime.now(), len(file)))
            file.to_excel(new_file_name, index=False)
            print('---{} exported {} successfully---'.format(datetime.datetime.now(), new_file_name))
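
# Chunking example: with limit = 300000 and 650000 rows, n = 3 and the slices
# cover rows [0, 300000), [300000, 600000), and [600000, 650000).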


file_path = r'D:\常用文件\1-数据分组待处理'
save_path = r'D:\常用文件\2-数据分组已处理'
files = [f for f in os.listdir(file_path) if f.endswith(('.xlsx', '.csv'))]  # only formats get_dates() can read
limit = 300000

file_list = get_dates(files)
df_drop = get_drop_dates()
df_all_filter = duplicates_dates(file_list, df_drop)
new_file_list = eachgroup_dates(file_list, df_all_filter)
export_dates(new_file_list, save_path, files, limit)
duplicates_drop_dates(file_list, df_drop, limit)  # runs last: it appends df_drop to file_list