【Python与经管学术】关键词词频统计与输出(按年度统计)

import pandas as pd
from collections import defaultdict
import datetime

def read_excel(file_path):
    df = pd.read_excel(file_path)
    return df

def keyword_filter_and_count_by_year(df, company_column, date_column, text_column, keywords):
    df[date_column] = pd.to_datetime(df[date_column]).dt.to_period('Y')
    grouped_data = df.groupby([company_column, date_column])

    summary_data = []

    for (company, year), group in grouped_data:
        num_filtered_rows = 0
        keyword_counter = defaultdict(int)

        for index, row in group.iterrows():
            text = row[text_column]
            row_keywords = set()
            for keyword in keywords:
                if keyword in text:
                    row_keywords.add(keyword)
                    keyword_counter[keyword] += 1

            if row_keywords:
                num_filtered_rows += 1

        keyword_counts = {k: keyword_counter[k] for k in keywords}
        summary_data.append((company, year, num_filtered_rows, *keyword_counts.values()))

    return summary_data

def write_to_excel(summary_data, output_file_path, keywords):
    columns = ['证券代码', '年份', '包含关键词的提问个数'] + keywords
    summary_df = pd.DataFrame(summary_data, columns=columns)
    summary_df.to_excel(output_file_path, index=False)

def main():
    input_file_path = "   "
    output_file_path = '   '
    company_column = 'Symbol'
    date_column = 'QuestionDate'
    text_column = 'QuestionContent'
    keywords = [    ]

    df = read_excel(input_file_path)
    summary_data = keyword_filter_and_count_by_year(df, company_column, date_column, text_column, keywords)
    write_to_excel(summary_data, output_file_path, keywords)

if __name__ == '__main__':
    main()

猜你喜欢

转载自blog.csdn.net/m0_56120502/article/details/130445069