在SAS中,使用PROC MEANS过程配合WEIGHT和OUTPUT等语句可以很方便地计算不同组合下的N, SUMWGT, SUM, MEAN, MIN, MAX, STD等统计量。Python是免费开源的编程语言,配合pandas库可以完成很多的统计计算和科学数据分析工作。
那么,python中有没有类似的代码来完成SAS中的PROC MEANS任务呢?参考Stack Overflow上的一篇问答帖子,我测试并重新整理成了以下完整的代码以及用法示例。读者可以很方便地在FUNC_GENS部分加入更多自己的函数。
#!/usr/bin/env python3 # coding: utf-8 import pandas as pd def proc_means(data, var_names_in, var_names_out, var_functions, var_name_weight = None, var_names_group = None): """ An implementation of SAS PROC MEANS in python See: Creating a partial SAS PROC SUMMARY replacement in Python/Pandas http://stackoverflow.com/questions/29926940/creating-a-partial-sas-proc-summary-replacement-in-python-pandas Arguments: data: A pandas dataframe. var_names_in: A list of variable names in the dataframe to be used for calculation. var_names_out: A list of variable names to be returned after the calculation. var_functions: A list of functions, as shown in FUNC_GENS. var_name_weight: The weight factor. var_names_group: A list of variable names to be used for groupby. """ FUNC_GENS = { "mean" : lambda var_name_in, var_name_weight: lambda x: x[var_name_in].mean(), "max" : lambda var_name_in, var_name_weight: lambda x: x[var_name_in].max(), "min" : lambda var_name_in, var_name_weight: lambda x: x[var_name_in].min(), "sumwgt": lambda var_name_in, var_name_weight: lambda x: x[var_name_weight].sum(), "count" : lambda var_name_in, var_name_weight: lambda x: x[var_name_in].count() , "sum": lambda var_name_in, var_name_weight: lambda x: (x[var_name_in] * x[var_name_weight]).sum(), "wmean" : lambda var_name_in, var_name_weight: lambda x: (x[var_name_in] * x[var_name_weight]).sum() / x[var_name_weight].sum(), "std" : lambda var_name_in, var_name_weight: lambda x: x[var_name_in].std(), } result = pd.DataFrame() if var_names_group is None: grouped = data.groupby(lambda x: True) else: grouped = data.groupby(var_names_group) for var_name_in, var_name_out, var_function in \ zip(var_names_in,var_names_out,var_functions): func_gen = FUNC_GENS[var_function] func = func_gen(var_name_in, var_name_weight) result[var_name_out] = grouped.apply(func) return result # Usage example: df = pd.DataFrame( {'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C' : [1,2,3,4,5,6,7,8], 'D' : [100, 200, 300, 400, 500, 600, 700, 800] }) print(df) ''' A B C D 0 foo one 1 100 1 bar one 2 200 2 foo two 3 300 3 bar three 4 400 4 foo two 5 500 5 bar two 6 600 6 foo one 7 700 7 foo three 8 800 ''' result = proc_means( data = df, var_names_in = ['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D'], var_names_out = ['D_mean', 'D_max', 'D_min', 'D_sum', 'D_count', 'D_sumwgt', 'D_wmean', 'D_std'], var_functions=['mean','max', 'min', 'sum', 'count', 'sumwgt', 'wmean', 'std'] , var_name_weight="C", var_names_group = ['A', 'B']) print(result) ''' D_mean D_max D_min D_sum D_count D_sumwgt D_wmean D_std A B bar one 200 200 200 400 1 2 200 NaN three 400 400 400 1600 1 4 400 NaN two 600 600 600 3600 1 6 600 NaN foo one 400 700 100 5000 2 8 625 424.264069 three 800 800 800 6400 1 8 800 NaN two 400 500 300 3400 2 8 425 141.421356 '''
Leave a Reply