cleaned up bar maps

This commit is contained in:
2018-10-03 16:11:31 +02:00
parent ce3c1e9ed1
commit 7da2bfc400
25 changed files with 78972 additions and 8 deletions

172
stimmen/folium.py Normal file
View File

@@ -0,0 +1,172 @@
import folium
from jupyter_progressbar import ProgressBar
from pygeoif.geometry import mapping
from shapely.geometry.geo import shape, box
from stimmen.cbs import data_file
from html import escape
import numpy as np
from stimmen.latitude_longitude import reverse_latitude_longitude
def get_palette(n, no_black=True, no_white=True):
with open(data_file('data', 'glasbey', '{}_colors.txt'.format(n + no_black + no_white))) as f:
return [
'#%02x%02x%02x' % tuple(int(c) for c in line.replace('\n', '').split(','))
for line in f
if not no_black or line != '0,0,0\n'
if not no_white or line != '255,255,255\n'
]
def colored_name(name, color):
return '<span style=\\"color:{}; \\">{}</span>'.format(color, name)
def region_area_cdf(region_shape, resolution=10000):
xmin, ymin, xmax, ymax = region_shape.bounds
shape_area = region_shape.area
spaces = np.linspace(xmin, xmax, resolution + 1)
return np.array([
box(xmin, ymin, xmax_, ymax).intersection(region_shape).area / shape_area
for xmax_ in spaces
])
# Only slightly faster than region_area_cdf.
# def fast_sliced_shape_areas(region_shape, recursions=13):
# results = np.zeros(2 ** recursions)
# xmin, ymin, xmax, ymax = region_shape.bounds
# total = 0
#
# def f(shape_, xmin, ymin, xmax, ymax, recursions, results_):
# nonlocal total
# shape_ = box(xmin, ymin, xmax, ymax).intersection(shape_)
# if recursions == 0:
# assert results_.shape == (1,)
# results_[0] = shape_.area
# total += shape_.area
# else:
# xmiddle = xmin + (xmax - xmin) / 2
# middle_index = len(results_) // 2
# f(shape_, xmin, ymin, xmiddle, ymax, recursions - 1, results_[:middle_index])
# f(shape_, xmiddle, ymin, xmax, ymax, recursions - 1, results_[middle_index:])
#
# f(region_shape, xmin, ymin, xmax, ymax, recursions, results)
# return results / results.sum() * region_shape.area
def area_adjust_boundaries(region_shape, boundaries, region_cdf_cache=None, resolution=10000):
"""Adjust the boundaries from percentage of the width of a shape, to percentage of the area of a shape"""
if region_cdf_cache is None:
region_cdf_cache = region_area_cdf(region_shape, resolution)
elif not isinstance(region_cdf_cache, np.ndarray):
region_cdf_cache = np.array(region_cdf_cache)
return width_adjust_boundaries(
region_shape,
np.abs(region_cdf_cache[None, :] - boundaries[:, None]).argmin(axis=1) / resolution
)
def width_adjust_boundaries(region_shape, boundaries):
xmin, _, xmax, _ = region_shape.bounds
return boundaries * (xmax - xmin) + xmin
def pronunciation_bars(
regions, dataframe,
region_name_property, region_name_column,
group_column='answer_text',
cutoff_percentage=0.05,
normalize_area=True,
progress_bar=False,
):
# all values of group_column that appear at least cutoff_percentage in one of the regions
relevant_groups = {
group
for region_name, region_rows in dataframe.groupby(region_name_column)
for group, aggregation in region_rows.groupby(
group_column).agg({group_column: len}).iterrows()
if aggregation[group_column] >= cutoff_percentage * len(region_rows)
}
group_to_color = dict(zip(relevant_groups, get_palette(len(relevant_groups))))
group_to_color['other'] = '#ccc'
n_other = len(dataframe) - sum(
sum(dataframe[group_column] == group_value)
for group_value in relevant_groups
)
# Each FeatureGroup represents all polygons (one for each region) of the relevant_groups
feature_groups = {
group_value: folium.FeatureGroup(
name=colored_name(
'{value} ({amount})'.format(value=escape(group_value), amount=amount),
color
),
overlay=True
)
for group_value, color in group_to_color.items()
for amount in [
sum(dataframe[group_column] == group_value)
if group_value != 'other' else
n_other
] # alias
}
progress_bar = ProgressBar if progress_bar else lambda x: x
# for each region, create the bar-polygons.
for feature in progress_bar(regions['features']):
region_name = feature['properties'][region_name_property]
region_rows = dataframe[dataframe[region_name_column] == region_name]
region_shape = shape(feature['geometry'])
_, ymin, _, ymax = region_shape.bounds
group_values_occurrence = {
group_value: aggregation[group_column]
for group_value, aggregation in region_rows.groupby(group_column).agg({group_column: len}).iterrows()
if group_value in relevant_groups
}
group_values_occurrence['other'] = len(region_rows) - sum(group_values_occurrence.values())
group_values, group_occurrences = zip(*sorted(
group_values_occurrence.items(),
key=lambda x: (x[0] == 'other', -x[1])
))
group_percentages = np.array(group_occurrences) / len(region_rows)
group_boundaries = np.cumsum((0,) + group_occurrences) / len(region_rows)
if normalize_area:
if '__region_shape_cdf_cache' not in feature['properties']:
feature['properties']['__region_shape_cdf_cache'] = region_area_cdf(region_shape).tolist()
group_boundaries = area_adjust_boundaries(
region_shape, group_boundaries,
region_cdf_cache=feature['properties']['__region_shape_cdf_cache']
)
else:
group_boundaries = width_adjust_boundaries(region_shape, group_boundaries)
for group_value, percentage, count, left_boundary, right_boundary in zip(
group_values,
group_percentages,
group_occurrences,
group_boundaries[:-1], group_boundaries[1:]
):
if count == 0 or left_boundary == right_boundary:
continue
bar_shape = region_shape.intersection(box(left_boundary, ymin, right_boundary, ymax))
if bar_shape.area == 0:
continue
polygon = folium.Polygon(
reverse_latitude_longitude(mapping(bar_shape)['coordinates']),
fill_color=group_to_color[group_value],
fill_opacity=0.8,
color=None,
popup='{} ({}, {: 3d}%)'.format(group_value, count, int(round(100 * percentage)))
)
polygon.add_to(feature_groups[group_value])
return feature_groups

View File

@@ -1,5 +1,6 @@
from pygeoif.geometry import mapping
from shapely.geometry import shape
from shapely.geometry.point import Point
def merge_features(geojson, condition, aggregate={}):
@@ -40,4 +41,40 @@ def merge_features(geojson, condition, aggregate={}):
'geometry': mapping(union),
'properties': properties
})
return geojson
return geojson
def inject_geojson_regions_into_dataframe(
geojson, dataframe,
latitude_column='latitude', longitude_column='longitude',
region_name_property='name',
region_name_column='region'
):
"""adds a region_name_column column to the dataframe with the region name as specified
in the region_name_property of the geojson, by checking which geojson feature geometrically
contains the longitude and latitude of the dataframe's row. This allows for faster cross
reference between the geojson and the dataframe compared to always checking shape-point
containment when cross referencing. Operates in place."""
shapes = {
feature['properties'][region_name_property]: shape(feature['geometry'])
for feature in geojson['features']
}
def get_region_name(point):
nonlocal shapes
for region_name, region_shape in shapes.items():
if region_shape.contains(point):
return region_name
point_to_region_name = {
(latitude, longitude): get_region_name(point)
for latitude, longitude in set(zip(dataframe[latitude_column], dataframe[longitude_column]))
for point in [Point(longitude, latitude)] # alias
}
dataframe[region_name_column] = [
point_to_region_name[(latitude, longitude)]
for latitude, longitude in zip(dataframe[latitude_column], dataframe[longitude_column])
]
return dataframe