Dagum Gini Decomposition,Dagum基尼系数分解的Python实例
背景
基尼系数(Gini index or Gini Coefficient),是国际上通用的、用以衡量一个国家内或地区内总体居民收入差距的常用指标之一。
但是其全局性的特征也意味着缺少了局部的收入(或发展等)的相对区域化的信息。1997年,Camilo Dagum提出了一种对基尼系数进行区域化分解的方法,用以将基尼系数这个相对宏观的数值描述分解为子区域内差距、子区域间差距等相对细节的描述以及超变密度。而这样的“总体的结果由局部的影响综合而来”的过程也是符合直觉的。
因本文作者非经济学学生,故不对超变密度的意义做深究。
分解过程
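基尼系数: $G = \dfrac{\sum_{j=1}^{k}\sum_{h=1}^{k}\sum_{i=1}^{n_j}\sum_{r=1}^{n_h}\left|y_{ji}-y_{hr}\right|}{2n^{2}\bar{y}}$,其中 $k$ 为子区域个数,$n_j$ 为子区域 $j$ 的样本数,$n$ 为总样本数,$\bar{y}$ 为全体样本的均值。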
代码实例
由于测试输入涉及未发表数据,故暂不公开,目前仅可通过邮件索要包含人为错误信息的测试输入。
'''
Dagum Gini coefficient decomposition.
@author: Xian Yuehui, <xianyuehui@stu.xjtu.edu.cn>
@date: 2022.07.10
@licence: GPL clause
'''
from itertools import permutations, product, combinations
from typing import Dict, List, Set, Tuple

import numpy as np
def read_data(
    file_path: str = 'gini.txt',
) -> Tuple[Dict[int, Dict[int, List[float]]], Set[int]]:
    """Read a tab-separated table of yearly values grouped by subgroup.

    Layout consumed by this parser:
      * line 1: column labels (read and discarded);
      * line 2: one integer subgroup id per data column;
      * every following line: a year followed by one float per data column.

    Blank lines among the data rows (for example a trailing empty line) are
    skipped instead of aborting the whole read.

    Args:
        file_path: Path of the input file.

    Returns:
        A tuple ``(data, subgroup_ids)`` where ``data[year][subgroup_id]`` is
        the list of values of the columns assigned to that subgroup, in
        column order, and ``subgroup_ids`` is the set of distinct subgroup
        ids found on line 2.

    Raises:
        ValueError: If a cell cannot be parsed as a number.
        IndexError: If a data row has more value columns than line 2 has ids.
    """
    with open(file_path, 'rt') as f:
        f.readline()  # column labels are not used
        subgroup_ids = list(map(int, f.readline().strip().split('\t')))
        unique_ids = set(subgroup_ids)
        data = {}
        for line in f:
            line = line.strip()
            if not line:
                # A blank row carries no data; float('') would raise otherwise.
                continue
            row = list(map(float, line.split('\t')))
            year_id = int(row[0])
            # Every subgroup gets an entry, even if the row is short.
            year_data = {subgroup_id: [] for subgroup_id in unique_ids}
            for idx, value in enumerate(row[1:]):
                year_data[subgroup_ids[idx]].append(value)
            data[year_id] = year_data
    return data, unique_ids
def sum_abs_product(a: List[float], b: List[float]) -> float:
    """Return the sum of |x - y| over every pair (x, y) with x in a, y in b.

    Pairs are visited with ``a`` as the outer sequence and ``b`` as the inner
    one. An empty ``a`` or ``b`` yields 0.
    """
    return sum(abs(x - y) for x in a for y in b)
def Gini(a: List[float], b: List[float]) -> float:
    """Return the Gini ratio between two groups of values.

    The result is the sum of absolute differences over all cross pairs of
    ``a`` and ``b`` divided by ``(mean(a) + mean(b)) * len(a) * len(b)``.
    Passing the same list twice gives the ordinary Gini coefficient of that
    list.

    Raises:
        ZeroDivisionError: If either list is empty or the two means sum to 0.
    """
    size_a = len(a)
    size_b = len(b)
    mean_a = sum(a) / size_a
    mean_b = sum(b) / size_b
    scale = (mean_a + mean_b) * size_a * size_b
    return sum_abs_product(a, b) / scale
def _relative_affluence(v_j: List[float], v_h: List[float]) -> float:
    """Return D_jh = (M_jh - N_jh) / (M_jh + N_jh) for two groups of values.

    M_jh is the mean over all pairs (x in v_j, y in v_h) of max(x - y, 0) and
    N_jh is the mean of max(y - x, 0) over the same pairs. When no pair
    differs both means are zero and the ratio is undefined; 0.0 is returned
    in that case so a degenerate pair does not inject NaN into the totals.
    """
    diffs = [x - y for x, y in product(v_j, v_h)]
    m_jh = np.array([d if d > 0 else 0 for d in diffs]).mean()
    n_jh = np.array([-d if d < 0 else 0 for d in diffs]).mean()
    total = m_jh + n_jh
    if total == 0:
        return 0.0
    return (m_jh - n_jh) / total


def _share(part: float, total: float) -> float:
    """Return part / total, or NaN when total is zero."""
    if total == 0:
        return float('nan')
    return part / total


def _decompose_year(year_data: Dict[int, List[float]]):
    """Compute the Dagum decomposition of one year's data.

    Args:
        year_data: Maps each subgroup id to the list of its values.

    Returns:
        A tuple ``(G, G_w, G_nb, G_t, G_jh, D_jh)``: the overall Gini
        coefficient, its within-group, net between-group and transvariation
        parts, and the nested dicts of pairwise Gini ratios ``G_jh[j][h]``
        and relative affluence values ``D_jh[j][h]``.
    """
    # Order subgroups by descending mean so that in every pair (j, h) taken
    # from this order below, j is the subgroup with the larger mean.
    group_means = [(key, sum(values) / len(values))
                   for key, values in year_data.items()]
    group_means.sort(key=lambda item: item[-1], reverse=True)
    sorted_keys = [key for key, _ in group_means]

    pooled = []
    for key in sorted_keys:
        pooled.extend(year_data[key])

    gini_total = Gini(pooled, pooled)

    G_jh = {key: {} for key in sorted_keys}
    D_jh = {key: {} for key in sorted_keys}
    for key_j, key_h in product(sorted_keys, sorted_keys):
        v_j, v_h = year_data[key_j], year_data[key_h]
        G_jh[key_j][key_h] = Gini(v_j, v_h)
        D_jh[key_j][key_h] = _relative_affluence(v_j, v_h)

    # P_j: share of observations in subgroup j; S_j: its share of the total.
    n_total = len(pooled)
    Y_avg = sum(pooled) / n_total
    P_j, S_j = {}, {}
    for key, values in year_data.items():
        P_j[key] = len(values) / n_total
        y_j_avg = sum(values) / len(values)
        S_j[key] = P_j[key] * y_j_avg / Y_avg

    g_within = sum([G_jh[key][key] * P_j[key] * S_j[key] for key in year_data])

    g_net_between, g_trans = 0., 0.
    for j, h in combinations(sorted_keys, 2):
        weight = P_j[j] * S_j[h] + P_j[h] * S_j[j]
        g_net_between += G_jh[j][h] * D_jh[j][h] * weight
        g_trans += G_jh[j][h] * weight * (1 - D_jh[j][h])

    return gini_total, g_within, g_net_between, g_trans, G_jh, D_jh


def main(result_save_path: str = 'dagum_gini_coef_decomp.txt') -> None:
    """Read the input table, decompose every year and write a TSV report.

    One row is written per year: the overall Gini coefficient, its three
    parts, their sum and their shares of the overall value, followed by the
    pairwise G_jh and D_jh values. Shares are written as NaN for a year
    whose overall Gini coefficient is zero.
    """
    data, subgroup_ids = read_data()

    # Column order of the pairwise values is fixed by the sorted subgroup
    # ids, independently of the per-year ordering by mean.
    sub_group_ids = sorted(subgroup_ids)
    G_idx_list = [(idx, idx) for idx in sub_group_ids]
    G_idx_list.extend(combinations(sub_group_ids, 2))
    D_idx_list = list(permutations(sub_group_ids, 2))

    header = ['#year', 'Gini', 'G_nb', 'G_w', 'G_t', 'G_nb+G_w+G_t',
              'G_nb/Gini', 'G_w/Gini', 'G_t/Gini']
    header.extend('G_{}_{}'.format(j, h) for j, h in G_idx_list)
    header.extend('D_{}_{}'.format(j, h) for j, h in D_idx_list)

    with open(result_save_path, 'wt') as f:
        f.write('\t'.join(header) + '\n')
        for year, year_data in data.items():
            G, G_w, G_nb, G_t, G_jh, D_jh = _decompose_year(year_data)
            fields = [year, G, G_nb, G_w, G_t, G_nb + G_w + G_t,
                      _share(G_nb, G), _share(G_w, G), _share(G_t, G)]
            fields.extend(G_jh[j][h] for j, h in G_idx_list)
            fields.extend(D_jh[j][h] for j, h in D_idx_list)
            f.write('\t'.join('{}'.format(value) for value in fields) + '\n')


if __name__ == '__main__':
    main()
版权声明
禁止未进行原文章地址声明的文章转载。使用本文代码时请在适当位置给予感谢声明。
参考资料
1. Dagum C. A new approach to the decomposition of the Gini income inequality ratio[M]//Income Inequality, Poverty, and Economic Welfare. Physica-Verlag HD, 1998: 47-63.
2201_76048201: 博主您好,给您发邮件啦,期待您的回复~
YSHsihan: 博主你好,给您发邮件啦,期待您的回复!
wsxyh1071652438: 应该昨天就给你发过去了
pai__da_xing: 博主你好,给您发邮件啦,期待您的回复!
今天做数学题了吗966: 已经收到 非常感谢!