from datetime import timedelta
import glob
import inspect
import re
from collections import defaultdict
from itertools import izip
import logging
import os
import random
import time
from tempfile import NamedTemporaryFile
import numpy as np
import pandas as pd
import rasterio
from fiona.crs import from_epsg
from geopandas import GeoDataFrame, GeoSeries
from osgeo import gdal
from rasterio import features
from rasterio.io import MemoryFile
from rasterio.fill import fillnodata
from rasterio.mask import mask as rio_mask
from rasterio.warp import reproject, Resampling
# from rasterio.warp import aligned_target
from rasterio.windows import get_data_window, intersection, from_bounds
from scipy.cluster.vq import *
from scipy import stats
from shapely.geometry import LineString, Point, mapping
from shapely.ops import linemerge
from . import crs as pyprecag_crs
from . import TEMPDIR, config
from .table_ops import calculate_strip_stats
from .convert import convert_polygon_to_grid, convert_grid_to_vesper, numeric_pixelsize_to_string, \
convert_polygon_feature_to_raster, drop_z, deg_to_8_compass_pts, point_to_point_bearing, \
text_rotation
from .describe import save_geopandas_tofile, VectorDescribe, get_dataframe_encoding
from .errors import GeometryError, SpatialReferenceError
from .vector_ops import thin_point_by_distance
from .raster_ops import focal_statistics, save_in_memory_raster_to_file, reproject_image, \
calculate_image_indices, stack_and_clip_rasters
LOGGER = logging.getLogger(__name__)
LOGGER.addHandler(logging.NullHandler())
[docs]def block_grid(in_shapefilename, pixel_size, out_rasterfilename,
out_vesperfilename, nodata_val=-9999, snap=True, overwrite=False):
"""Convert a polygon boundary to a 0,1 raster and generate a VESPER compatible list of
coordinates for kriging.
Args:
in_shapefilename (str): Input polygon shapefile
pixel_size (float): The required output pixel size
out_rasterfilename (str): Filename of the raster Tiff that will be created
out_vesperfilename (str): The output vesper file
nodata_val (int): an integer to use as nodata
snap (bool): Snap Extent to a factor of the Pixel size
overwrite (bool): if true overwrite existing file
Requirements: Input shapefile should be in a projected coordinate system........
Notes:
Define pixel_size and no data value of new raster
see http://stackoverflow.com/questions/2220749/rasterizing-a-gdal-layer
https://gis.stackexchange.com/a/31658
This works, but the extent of the output differs from that of an arcpy generated raster.
See post: https://gis.stackexchange.com/q/139336
"""
if not isinstance(pixel_size, (int, long, float)):
raise TypeError('Pixel size must be an integer or floating number.')
if not isinstance(nodata_val, (int, long)):
raise TypeError('Nodata value must be an integer.')
desc_poly_shp = VectorDescribe(in_shapefilename)
if not desc_poly_shp.crs.srs.IsProjected():
raise SpatialReferenceError('Shapefile must be in a projected coordinate system')
if 'POLY' not in desc_poly_shp.geometry_type.upper():
raise GeometryError('Invalid Geometry. Input shapefile should be polygon or multipolygon')
start_time = time.time()
convert_polygon_to_grid(in_shapefilename=in_shapefilename,
out_rasterfilename=out_rasterfilename,
pixel_size=pixel_size,
snap_extent_to_pixel=snap,
nodata_val=nodata_val,
overwrite=overwrite)
convert_grid_to_vesper(in_rasterfilename=out_rasterfilename,
out_vesperfilename=out_vesperfilename)
LOGGER.info('{:<30}\t{dur:<15}\t{}'.format(inspect.currentframe().f_code.co_name, '',
dur=timedelta(seconds=time.time() - start_time)))
[docs]def create_polygon_from_point_trail(points_geodataframe, points_crs, out_filename, thin_dist_m=1.0,
aggregate_dist_m=25, buffer_dist_m=10, shrink_dist_m=3):
"""Create a polygon from a Point Trail created from a file containing GPS coordinates.
The point order should be sorted by increasing time sequence to ensure the 'dot-to-dot' occurs
correctly.
The workflow is as follows:
Points -> Lines -> Buffer Out (expand) -> Buffer In (shrink)
For efficiency, points will be thinned. This will remove points less than the set thinDist.
Resulting points will be connected to form lines.
Line ends are detected where the distance between points is greater than the aggregate distance.
This will occur at a turning point or interruption (ie creek) and gps point collection is
stopped. Typically this distance is slightly greater than the row/swath width.
Buffering is used to convert to polygon and remove the gap between row_count/swaths. The buffer
distance is usually half the row/swath width.
Buffering with a negative value is used to remove excess area on the outside of the polygon.
This shrink distance is usually around 7m less than the buffer distance.
Args:
points_geodataframe (geopandas.geodataframe.GeoDataFrame): Input points vector geodataframe
points_crs (pyprecag_crs.crs): The Projected Spatial Reference System of the
point_geodataframe
out_filename (str): Output polygon file. This should be the same format as the input.
thin_dist_m (float): The minimum distance in metres between points to be used to thin the
points dataset.
aggregate_dist_m (int): A floating number representing the maximum distance between point.
This is used to detect a line end. Typically this is slightly larger than
the row/swath width.
buffer_dist_m (int): The Buffer distance in metres. Typically half the swath or row width.
shrink_dist_m (int): The shrink distance in metres. Typically about 7 less than the buffer
distance.
"""
for argCheck in [('thin_dist_m', thin_dist_m), ('aggregate_dist_m', aggregate_dist_m),
('buffer_dist_m', buffer_dist_m), ('shrink_dist_m', shrink_dist_m)]:
if not isinstance(argCheck[1], (int, long, float)):
raise TypeError('{} must be a floating number.'.format(argCheck[0]))
if not isinstance(points_geodataframe, GeoDataFrame):
raise TypeError('Invalid input data : inputGeoDataFrame')
if 'POINT' not in ','.join(list(points_geodataframe.geom_type.unique())).upper():
raise TypeError('Invalid input data : A points geopandas dataframe is required')
if not isinstance(points_crs, pyprecag_crs.crs):
raise TypeError('Crs must be an instance of pyprecag.crs.crs')
if out_filename is None or out_filename == '':
raise TypeError('Please specify an output filename')
if not os.path.exists(os.path.dirname(out_filename)):
raise IOError('Output directory {} does not exist'.format(os.path.dirname(out_filename)))
start_time = time.time()
points_geodataframe = points_geodataframe.copy()
if 'FID' not in points_geodataframe.columns:
points_geodataframe['FID'] = points_geodataframe.index
# don't need any attribution so drop it all except fid and geometry
dropcols = [ea for ea in points_geodataframe.columns.tolist() if ea not in ['geometry', 'FID']]
points_geodataframe.drop(dropcols, axis=1, inplace=True)
ptsgdf_crs = points_crs.epsg
gdf_thin = thin_point_by_distance(points_geodataframe, points_crs, thin_dist_m)
gdf_thin = gdf_thin[gdf_thin['filter'].isnull()].copy()
del points_geodataframe
dropcols = [ea for ea in gdf_thin.columns.tolist() if ea not in ['geometry', 'FID']]
gdf_thin.drop(dropcols, axis=1, inplace=True)
step_time = time.time()
# calc distance between two points
gdf_thin['dist_shift'] = gdf_thin.distance(gdf_thin.shift(1))
# Calc first value to be 0
gdf_thin.loc[gdf_thin['dist_shift'].isnull(), 'dist_shift'] = 0
# apply a new row/line id when distance between points is greater than 25m
gdf_thin["lineID"] = (gdf_thin["dist_shift"] >= aggregate_dist_m).cumsum()
LOGGER.info('{:<30} {:<15} {dur}'.format('Assign lineID', '',
dur=timedelta(seconds=time.time() - step_time)))
temp_file_list = []
if config.get_debug_mode():
temp_file_list = [os.path.join(TEMPDIR, os.path.basename(
out_filename.replace('.shp', '_0thnpts.shp')))]
save_geopandas_tofile(gdf_thin, temp_file_list[-1], overwrite=True)
step_time = time.time()
# A line must have at least two points so remove lines with only one point.
# count all points per lineID
ptsperline = gdf_thin.groupby('lineID').count()
# select all LineID's with more than one point
gdf_thin = gdf_thin.loc[gdf_thin['lineID'].isin(ptsperline[ptsperline['geometry'] > 1].index)]
# convert to lines
# source https://gis.stackexchange.com/a/202274
try:
# Aggregate these points with the GroupBy - for shapely 1.3.2+ use
df_line = gdf_thin.groupby(['lineID'])['geometry'].apply(lambda x: LineString(x.tolist()))
except ValueError:
df_line = gdf_thin.groupby(['lineID'])['geometry'].apply(
lambda x: LineString([(p.x, p.y) for p in x]))
gdf_final = GeoDataFrame(df_line, geometry='geometry')
del gdf_thin, ptsperline, df_line
gdf_final.crs = ptsgdf_crs
LOGGER.info('{:<30} {:<15} {dur}'.format('Convert to lines', '',
dur=timedelta(seconds=time.time() - step_time)))
if config.get_debug_mode():
temp_file_list.append(os.path.join(TEMPDIR, os.path.basename(
out_filename.replace('.shp', '_1line.shp'))))
save_geopandas_tofile(gdf_final, temp_file_list[-1], overwrite=True)
# buffer and dissolve overlap of lines
step_time = time.time()
gdf_final = GeoDataFrame(geometry=[gdf_final.buffer(buffer_dist_m).unary_union])
gdf_final.crs = ptsgdf_crs
gdf_final['FID'] = gdf_final.index
LOGGER.info('{:<30} {:<15} {dur}'.format('Buffer by {}'.format(buffer_dist_m), '',
dur=timedelta(seconds=time.time() - step_time)))
if config.get_debug_mode():
temp_file_list.append(os.path.join(
TEMPDIR, os.path.basename(out_filename.replace('.shp', '_2buf.shp'))))
save_geopandas_tofile(gdf_final, temp_file_list[-1], overwrite=True)
if shrink_dist_m != 0:
step_time = time.time()
gdf_final = GeoDataFrame(geometry=gdf_final.buffer(-abs(shrink_dist_m)))
gdf_final.crs = ptsgdf_crs
LOGGER.info('{:<30} {:<15} {dur}'.format('Shrink by {}'.format(shrink_dist_m), '',
dur=timedelta(seconds=time.time() - step_time)))
if config.get_debug_mode():
temp_file_list.append(os.path.join(
TEMPDIR, os.path.basename(out_filename.replace('.shp', '_3bufshrink.shp'))))
save_geopandas_tofile(gdf_final, temp_file_list[-1], overwrite=True)
step_time = time.time()
# https://github.com/geopandas/geopandas/issues/174#issuecomment-63126908
explode = gdf_final.explode().reset_index().rename(
columns={0: 'geometry'}).set_geometry('geometry')['geometry']
gdf_final = GeoDataFrame(geometry=explode)
gdf_final.crs = ptsgdf_crs
gdf_final['FID'] = gdf_final.index
gdf_final['Area'] = gdf_final.area
gdf_final['Perimeter'] = gdf_final.length
LOGGER.info('{:<30} {:<15} {dur}'.format('Explode polygons', '',
dur=timedelta(seconds=time.time() - step_time)))
save_geopandas_tofile(gdf_final, out_filename, overwrite=True)
LOGGER.info('{:<30}\t{dur:<15}\t{}'.format(inspect.currentframe().f_code.co_name, '',
dur=timedelta(seconds=time.time() - start_time)))
thin_ratio = (4 * 3.14 * gdf_final['Area'].sum() /
(gdf_final['Perimeter'].sum() * gdf_final['Perimeter'].sum()))
LOGGER.debug('{:<25} {:.5f}'.format('Thin Ratio : ', thin_ratio))
if thin_ratio < 0.01:
LOGGER.warning('For an improved result, increase the buffer width and shrink '
'distance and try again.')
return 'For an improved result, increase the buffer width and shrink distance and ' \
'try again.'
else:
return
[docs]def clean_trim_points(points_geodataframe, points_crs, process_column, output_csvfile,
boundary_polyfile=None, out_keep_shapefile=None, out_removed_shapefile=None,
remove_zeros=True, stdevs=3, iterative=True, thin_dist_m=1.0):
""" Clean and/or Trim a points dataframe.
Preparation includes:
- Clip data to polygon.
- Move data to a Projected Coordinate system.
- Remove values where data_column are less than or equal to zero
- Calculate Normalised value for data_column (number of StDev).
- Iteratively Trim outliers based on Normalised data_column
- Remove points closer than a set distance (trim_dist_m)
Args:
points_geodataframe (geopandas.geodataframe.GeoDataFrame): The input points geodataframe
points_crs (pyprecag_crs.crs): The Spatial Reference System of the point_geodataframe
process_column (str): The column to normalise, trim and clean.
output_csvfile (str): The Trimmed & Cleaned output CSV file
out_keep_shapefile (str): Optionally save the Trimmed & Cleaned to a shapefile
out_removed_shapefile (str): Optionally save a shapefile containing features removed while
cleaning/filtering. A column called filter will be added showing the reason a
point was removed.
boundary_polyfile (str): Optionally a polygon used to Clip the points.
remove_zeros (bool): Optionally remove values where data_column are <= to zero
prior to removing outliers.
stdevs (int): The number of standard deviations used to trim outliers
iterative (bool): Optionally Iteratively Trim outliers based on Normalised Data column
thin_dist_m (float): A distance in metres representing the minimum allowed distance
between points. Points less than this distance will be removed.
Returns:
geopandas.geodataframe.GeoDataFrame: Representing the cleaned/trimmed data file.
New columns included in the output are:
Eastings, Northings - The point coordinates in a projected coordinate system
EN_Epsg - The epsg number of the projected coordinate system
Nrm_ - normalise data_column calculation.
pyprecag_crs.crs: The pyprecag CRS object of the points dataframe.
"""
if not isinstance(points_geodataframe, GeoDataFrame):
raise TypeError('Invalid input data : inputGeodataFrame')
if 'POINT' not in ','.join(list(points_geodataframe.geom_type.unique())).upper():
raise TypeError('Invalid input data : a points geopandas dataframe is required')
if not isinstance(points_crs, pyprecag_crs.crs):
raise TypeError('Crs must be an instance of pyprecag.crs.crs')
if output_csvfile is None:
raise TypeError('Invalid input data : an output csv path and file is required')
for argCheck in [output_csvfile, out_keep_shapefile, out_removed_shapefile]:
if argCheck is None:
continue
if not os.path.exists(os.path.dirname(argCheck)):
raise IOError('Output folder does not exist: {}'.format(os.path.dirname(argCheck)))
for argCheck in [('thinDist_m', thin_dist_m), ('stdev', stdevs)]:
if not isinstance(argCheck[1], (int, long, float)):
raise TypeError('{} must be a integer or floating number.'.format(argCheck[0]))
if process_column not in points_geodataframe.columns:
raise TypeError('Column {} does not exist in input geodataframe'.format(process_column))
for argCheck in [('remove_zeros', remove_zeros), ('iterative', iterative)]:
if not isinstance(argCheck[1], bool):
raise TypeError('{} should be a boolean.'.format(argCheck[0]))
norm_column = 'nrm_' + process_column
LOGGER.info('Normalized Column is {}'.format(norm_column))
if norm_column in points_geodataframe.columns:
LOGGER.warning('Column {} already exists and will be overwritten'.format(norm_column))
if boundary_polyfile is not None:
if not os.path.exists(boundary_polyfile):
raise IOError("Invalid path: {}".format(boundary_polyfile))
ply_desc = VectorDescribe(boundary_polyfile)
if 'POLY' not in ply_desc.geometry_type.upper():
raise GeometryError('Invalid geometry. Input shapefile should be polygon'
' or multipolygon')
start_time = time.time()
# set a UniqueID Field which ISNT the FID for use through out the processing
id_col = 'PT_UID' # IE clean/trim fid
points_geodataframe[id_col] = points_geodataframe.index
gdf_points = points_geodataframe.copy()
# To speed things up, drop all unrequired columns
dropcols = [ea for ea in gdf_points.columns.tolist()
if ea not in ['geometry', id_col, norm_column, process_column]]
gdf_points.drop(dropcols, axis=1, inplace=True)
step_time = time.time()
# set defaults
gdf_points['filter'] = np.nan
gdf_points['filter_inc'] = 0
# Remove rows where data col is empty/null
subset = gdf_points[gdf_points[process_column].isnull()]
if len(subset) > 0:
gdf_points.loc[subset.index, 'filter'] = 'nulls'
gdf_points.loc[~gdf_points.index.isin(subset.index), 'filter_inc'] \
= len(gdf_points['filter'].value_counts())
subset = gdf_points[gdf_points['filter'].isnull()].copy()
# Remove duplicated geometries
# https://github.com/geopandas/geopandas/issues/521#issuecomment-382806444
geom = subset["geometry"].apply(lambda geom: geom.wkb)
subset = subset.loc[geom.drop_duplicates().index]
gdf_points.loc[~gdf_points.index.isin(subset.index), 'filter'] = 'Duplicate XY'
gdf_points.loc[~gdf_points.index.isin(subset.index), 'filter_inc'] \
= len(gdf_points['filter'].value_counts())
LOGGER.info('{:<30} {:>10,} {:<15} {dur}'.format(
'Remove Duplicate XYs', len(geom) - len(subset), '',
dur=timedelta(seconds=time.time() - start_time)))
subset = gdf_points[gdf_points['filter'].isnull()].copy()
if boundary_polyfile is not None:
gdf_poly = ply_desc.open_geo_dataframe()
if ply_desc.crs.epsg != points_crs.epsg_number:
# Preference is to the projected coordinate system then only project the smaller
# dataset (usually poly) By now points should be in the out projected coordinate system
if ply_desc.crs.epsg != points_crs.epsg_number:
gdf_poly.to_crs(epsg=points_crs.epsg_number, inplace=True)
LOGGER.info('{:<30} {:>10} {:<15} {dur}'.format(
'Reproject clip polygon', '', 'To epsg_number {}'.format(points_crs.epsg_number),
dur=timedelta(seconds=time.time() - step_time)))
step_time = time.time()
# Clip to boundary then apply to filter column
subset = subset[subset.intersects(gdf_poly.unary_union)].copy()
del gdf_poly
if len(subset) == 0:
raise GeometryError('Clipping removed all features. Check coordinate systems and/or '
'clip polygon layer and try again')
gdf_points.loc[~gdf_points.index.isin(subset.index), 'filter'] = 'clip'
gdf_points.loc[~gdf_points.index.isin(subset.index), 'filter_inc'] \
= len(gdf_points['filter'].value_counts())
LOGGER.info('{:<30} {:>10,} {:<15} {dur} '.format(
'Clip', len(subset), '', dur=timedelta(seconds=time.time() - step_time)))
step_time = time.time()
if remove_zeros:
subset = subset[subset[process_column] <= 0]
gdf_points.loc[subset.index, 'filter'] = 'zero'
gdf_points.loc[subset.index, 'filter_inc'] = len(gdf_points['filter'].value_counts())
if len(gdf_points[gdf_points['filter'].isnull()]) == 0:
raise GeometryError("Zero filter removed all points "
"in column {}".format(process_column))
if stdevs > 0:
i = 0
# Get a fresh copy of subset.
subset = gdf_points[gdf_points['filter'].isnull()].copy()
while len(subset) > 0:
i += 1
subset = gdf_points[gdf_points['filter'].isnull()].copy()
yld_mean = subset[process_column].mean()
yld_std = subset[process_column].std()
subset[norm_column] = subset[process_column].apply(lambda x: (x - yld_mean) / yld_std)
subset = subset[subset[norm_column].abs() >= stdevs]
gdf_points.loc[subset.index, 'filter'] = '{} std iter {}'.format(stdevs, int(i))
gdf_points.loc[subset.index, 'filter_inc'] = len(gdf_points['filter'].value_counts())
if not iterative:
break
LOGGER.info('{:<30} {:>10,} {:<15} {dur}'.format(
'Filter by column', len(gdf_points[gdf_points['filter'].isnull()]),
"{} (zeros, norm, {} std)".format(process_column, stdevs),
dur=timedelta(seconds=time.time() - step_time)))
gdf_thin = thin_point_by_distance(gdf_points[gdf_points['filter'].isnull()],
points_crs, thin_dist_m)
step_time = time.time()
# update(join/merge) gdfPoints['filter'] column with results from thinning.
gdf_points.loc[gdf_points.index.isin(gdf_thin.index), 'filter'] = gdf_thin['filter']
gdf_points.loc[gdf_points.index.isin(gdf_thin.index), 'filter_inc'] = \
gdf_thin['filter_inc'] + len(gdf_points['filter_inc'].value_counts())
del gdf_thin
# Add the incremental number to the filter column ie '01 clip'
gdf_points['filter'] = gdf_points[['filter_inc', 'filter']].dropna().apply(
lambda x: '%02d %s' % (x[0], x[1]), axis=1)
# after last iteration of thin by distance subset should be
yld_mean = gdf_points[gdf_points['filter'].isnull()].mean()[process_column]
yld_std = gdf_points[gdf_points['filter'].isnull()].std()[process_column]
gdf_points.loc[gdf_points['filter'].isnull(), norm_column] = \
(gdf_points[process_column] - yld_mean) / yld_std
# prepare some summary results for filtered features.
# Filter is the reason a point is removed,and filter_inc keeps them in the order they were
# removed. ie std it before thining by distance.
results_table = gdf_points.copy()
results_table['filter'].fillna('Pts remaining', inplace=True)
results_table['filter_inc'].fillna(len(results_table['filter'].value_counts()), inplace=True)
# Get a count of features per filter. filter_inc maintains the sort order
results_table = results_table.groupby(['filter_inc', 'filter']).size() \
.to_frame('feat_count').reset_index('filter')
# and add the total of the results
total_row = pd.DataFrame(data=[[len(results_table) + 1, 'Total',
results_table['feat_count'].sum()]],
columns=['filter_inc', 'filter', 'feat_count']).set_index('filter_inc')
# Add this to results table
results_table = results_table.append(total_row)
# Clean up filtered results by removing all columns except those new ones which
# have to be copied back to original
dropcols = [ea for ea in gdf_points.columns.tolist()
if ea not in ['geometry', norm_column, 'filter', id_col]]
gdf_points.drop(dropcols, axis=1, inplace=True)
# Add x,y coordinates to match coordinate system
gdf_points['Easting'] = gdf_points.geometry.apply(lambda p: p.x)
gdf_points['Northing'] = gdf_points.geometry.apply(lambda p: p.y)
gdf_points['EN_EPSG'] = points_crs.epsg_number
# Clean up the original input dataframe and remove existing geometry and coord columns
alt_coord_columns = config.get_config_key('geoCSV')['xCoordinate_ColumnName']
alt_coord_columns += config.get_config_key('geoCSV')['yCoordinate_ColumnName']
# Find and Drop coord columns if already exist.
coord_columns = [fld for fld in points_geodataframe.columns if fld.upper() in alt_coord_columns]
coord_columns = coord_columns + ['geometry']
if len(coord_columns) > 0:
points_geodataframe.drop(coord_columns, axis=1, inplace=True)
# Use geopandas merge instead of concat to maintain coordinate system info etc.
gdf_final = points_geodataframe.merge(gdf_points, on=id_col, how='outer')
# move newer columns to end (geometry, filter, etc)
gdf_final.drop([id_col], inplace=True, axis=1)
# get the appropriate file encoding and save the file
file_encoding = get_dataframe_encoding(gdf_final)
gdf_final[gdf_final['filter'].isnull()].drop(['geometry', 'filter'], axis=1) \
.to_csv(output_csvfile, index=False, encoding=file_encoding)
LOGGER.info('{:<30} {:>10,} {:<15} {dur}'.format(
'Save to CSV', len(gdf_final[gdf_final['filter'].isnull()]),
os.path.basename(output_csvfile), dur=timedelta(seconds=time.time() - step_time)))
step_time = time.time()
# gdfPoints have all points with a filter string assigned
if out_keep_shapefile is not None and out_keep_shapefile != '':
save_geopandas_tofile(gdf_final[gdf_final['filter'].isnull()].drop(['filter'], axis=1),
out_keep_shapefile, overwrite=True)
LOGGER.info('{:<30} {:>10,} {:<15} {dur}'.format(
'Save to Shapefile', len(gdf_final[gdf_final['filter'].isnull()]),
os.path.basename(out_keep_shapefile), dur=timedelta(seconds=time.time() - step_time)))
step_time = time.time()
if out_removed_shapefile is not None and out_removed_shapefile != '':
save_geopandas_tofile(gdf_final[gdf_final['filter'].notnull()]
.drop([norm_column], axis=1),
out_removed_shapefile, overwrite=True)
LOGGER.info('{:<30} {:>10,} {:<15} {dur}'
.format('Save removed Shapefile', len(gdf_final[gdf_final['filter'].notnull()]),
os.path.basename(out_removed_shapefile),
dur=timedelta(seconds=time.time() - step_time)))
LOGGER.info('\nResults:---------------------------------------\n{}\n'.format(
results_table.to_string(index=False, justify='center')))
LOGGER.info('{:<30}\t{dur:<15}\t{}'.format(inspect.currentframe().f_code.co_name, '',
dur=timedelta(seconds=time.time() - start_time)))
return gdf_final[gdf_final['filter'].isnull()], points_crs
[docs]def random_pixel_selection(raster, raster_crs, num_points, out_shapefile=None):
"""Select randomly distributed valid data pixels from a raster file and convert to points
representing the center of the pixel. There is an option to save to shapefile if required.
Note:
When opening a raster using RasterIO the coordinate system is imported from the Proj4 string
and converted to a crs_wkt. This means that the crs_wkt recorded against the opened dataset
does not always equal the crs_wkt of the original raster file. To remedy this, use
pyprecag_crs.getCRSfromRasterFile to create a crs object
Args:
raster (rasterio.io.DatasetReader):Opened raster file via rasterio.open(os.path.normpath())
num_points (int): The number of random sample points to select.
raster_crs (pyprecag_crs.crs): The Spatial Reference System for the raster file
out_shapefile (str):Optional.. the path and name of a shapefile used to save the points.
Returns:
geopandas.geodataframe.GeoDataFrame: A dataframe containing the select pixels as points
pyprecag_crs.crs: The pyprecag CRS object of the points dataframe.
"""
if not isinstance(raster, rasterio.DatasetReader):
raise TypeError("Input should be a rasterio.DatasetReader created "
"using rasterio.open(os.path.normpath())")
if not isinstance(num_points, (int, long)):
raise TypeError('Size must be an Integer.')
if not isinstance(raster_crs, pyprecag_crs.crs):
raise TypeError('Crs must be an instance of pyprecag.crs.crs')
if out_shapefile is not None:
if not os.path.exists(os.path.dirname(out_shapefile)):
raise IOError('Output directory {} does not exist'.format(
os.path.dirname(out_shapefile)))
elif config.get_debug_mode():
out_shapefile = '{}_randompts.shp'.format(inspect.getframeinfo(inspect.currentframe())[2])
band1 = raster.read(1, masked=True)
if num_points > band1.count():
raise ValueError('Size parameter is greater than the number of data pixels')
# Sources: enumerate using a masked array https://stackoverflow.com/a/8621145
# Create a geodataframe while iterating.... (Cell 2)
# http://nbviewer.jupyter.org/github/geopandas/geopandas/blob/master/examples/overlays.ipynb
# Create a flattened mask array
mask = ~band1.mask.ravel()
# create an array with the row/col address and current pixel value for data pixels only
# np.ndenumerate(image_read) returns ((row,col),val) which can be unpacked
# using ((r,c),v) and converted to a list using (int(r),int(c),v)
band1_index = [(int(r), int(c), v) for ((r, c), v), m in izip(np.ndenumerate(band1), mask) if m]
# get a fixes random sample
rand_samp = random.sample(band1_index, num_points)
# add coordinates for each sample point
rand_samp_xy = [pt + raster.xy(pt[0], pt[1]) for pt in rand_samp]
# convert to geopandas dataframe. Drop the data value from the output.
random_pts_gdf = GeoDataFrame([{'geometry': Point(pt[3], pt[4]), 'PtID': i, 'X': pt[3],
'Y': pt[4]}
for i, pt in enumerate(rand_samp_xy)], crs=raster_crs.epsg)
if out_shapefile is not None:
save_geopandas_tofile(random_pts_gdf, out_shapefile, overwrite=True)
return random_pts_gdf, raster_crs
[docs]def multi_block_bands_processing(image_file, pixel_size, out_folder, band_nums=[], image_epsg=0,
image_nodata=0, polygon_shapefile=None, groupby=None):
"""Derive multiple resampled image bands matching the specified pixel size and block grid extent
for each shapefile polygon.
Use this tool create individual band images for each polygon within a shapefile. A group-by
column may be used to dissolve multiple polygons belonging to an individual block. The fitting
of rasters to a base Block (grid) ensures for easier, more accurate multi-layered analysis
required by in Precision Agriculture.
The processing steps to achieve this are:
- Dissolve polygons optionally using the group-by column
Loop through each polygon feature and......
- Create Block Grid
- Clip image to polygon
- Resample and fit to block grid for a pixel size and using Average resampling technique
- Identify holes and fill if necessary
- smooth image using a 5x5 pixel moving average (focal_statistics)
If a polygon shapefile is not specified, polygons will be created from the images' mask
The polygon shapefile will be re-projected to match the image file
A block grid will be created for each feature for the nominated pixel size and used as the base
for analysis
"image_epsg" and "image_nodata" can be used to set the coordinate system and image nodata values
when they are not present within the image file.
The output filename will consist of the selected band number or the value of a band's rasterio
custom name tag
Args:
image_file (str): An input image
pixel_size (int, float): The desired output pixel size in metres.
out_folder (str): The output folder for the created images.
band_nums (List[int]): a list of band numbers to process. If empty all bands will be used
image_epsg (int): the epsg number for the image. Only used if not provided by the image.
image_nodata (int): the nodata value for the image. Only used if not provided by the image.
polygon_shapefile (str): a polygon shapefile used to cut up an image.
groupby (str): the column/field to use to group multiple features.
Returns:
List[str]: a list of created files.
"""
if isinstance(polygon_shapefile, str) and polygon_shapefile.strip() == '':
polygon_shapefile = None
if isinstance(groupby, str) and groupby.strip() == '':
groupby = None
for ea_arg in [('image_file', image_file), ('out_folder', out_folder),
('polygon_shapefile', polygon_shapefile)]:
if ea_arg[0] != 'polygon_shapefile':
if ea_arg[1] is None or ea_arg[1].strip() == '':
raise ValueError('{} is required '.format(ea_arg[0]))
if ea_arg[1] is not None and not os.path.exists(ea_arg[1]):
raise IOError('{} does not exist-Got {}'.format(ea_arg[0], os.path.dirname(ea_arg[1])))
for ea_arg in [('pixel_size', pixel_size), ('image_nodata', image_nodata)]:
if not isinstance(ea_arg[1], (int, long, float)):
raise TypeError('{} must be a integer or floating number - Got {}'.format(*ea_arg))
if not isinstance(image_epsg, (int, long)):
raise TypeError('image_epsg must be a integer - Got {}'.format(image_epsg))
if not isinstance(band_nums, list):
raise TypeError('band_nums must be a list of bands to resample')
if len(band_nums) == 0:
band_nums = list(rasterio.open(image_file).indexes)
else:
image_bands = list(rasterio.open(image_file).indexes)
band_check = [val for val in band_nums if val not in image_bands]
if len(band_check) > 0:
raise ValueError('{} are invalid bands.'.format(', '.join(band_check)))
if polygon_shapefile is not None:
vect_desc = VectorDescribe(polygon_shapefile)
if 'POLYGON' not in vect_desc.geometry_type.upper():
raise GeometryError('Invalid geometry. Input shapefile should be poly or multipolygon')
if groupby is not None and groupby not in vect_desc.get_column_names():
raise ValueError('Input groupby column does not exist in the shapefile')
del vect_desc
pixel_size_str = numeric_pixelsize_to_string(pixel_size)
filename, ext = os.path.splitext(os.path.basename(image_file))
temp_file_list = []
# ----------------------------------------------------------------------------------------------
# Run coordinate system checks on the input image
with rasterio.open(os.path.normpath(image_file)) as src:
if src.crs is None:
if image_epsg is None or image_epsg == 0:
raise ValueError('Input coordinate system required - image_file does not contain'
'a coordinate system, and in_epsg is 0')
else:
image_epsg = pyprecag_crs.getCRSfromRasterFile(image_file).epsg_number
# ----------------------------------------------------------------------------------------------
# run checks on input polygon, and if required reproject it to the raster coordinate system
gdf_poly = None
if groupby is None or groupby.strip() == '':
groupby = None
if polygon_shapefile is None:
# create a polygon from the image. hopefully by now the nodata val is correct.
with rasterio.open(image_file) as src:
# source: https://gis.stackexchange.com/a/187883
# Read dataset's valid data mask as a ndarray and extract from all bands (dataset_mask).
band = src.dataset_mask()
mask = band == 255 # mask is where band == 255
# Extract feature shapes and values from the array.
rast_shapes = rasterio.features.shapes(band, transform=src.transform, mask=mask)
geoms_geojson = [{'properties': {'val': val}, 'geometry': geom} for i, (geom, val) in
enumerate(rast_shapes)]
gdf_poly = GeoDataFrame.from_features(list(geoms_geojson), crs=from_epsg(image_epsg))
if config.get_debug_mode():
temp_file_list += [os.path.join(TEMPDIR, '{}_{}BlockBoundary_{}.shp'
.format(filename, len(temp_file_list) + 1,
image_epsg))]
save_geopandas_tofile(gdf_poly, temp_file_list[-1], overwrite=True)
del band, rast_shapes, geoms_geojson, mask
else:
desc_poly = VectorDescribe(polygon_shapefile)
if 'POLY' not in desc_poly.geometry_type.upper():
raise ValueError('Invalid input data : a polygon shapefile is required')
gdf_poly = desc_poly.open_geo_dataframe()
if groupby is not None and groupby not in gdf_poly.columns:
raise ValueError('Groupby column {} does not exist'.format(groupby))
if groupby is not None:
# change null/nones to blank string
gdf_poly[groupby].fillna('', inplace=True)
# reproject shapefile to raster
poly_epsg = desc_poly.crs.epsg_number
if poly_epsg != image_epsg:
# we have to reproject the vector points to match the raster
print 'WARNING: Projecting points from {} to {}'.format(desc_poly.crs.epsg, image_epsg)
gdf_poly.to_crs(epsg=image_epsg, inplace=True)
poly_epsg = image_epsg
del desc_poly
# If a column name is specified, dissolve by this and create multi-polygons
# otherwise dissolve without a column name at treat all features as one
if len(gdf_poly) > 1:
if groupby is not None:
gdf_poly = gdf_poly.dissolve(groupby, as_index=False).copy()
else:
gdf_poly = GeoDataFrame(geometry=[gdf_poly.unary_union])
# Loop through polygns features ----------------------------------------------------------------
output_files = []
for index, feat in gdf_poly.iterrows():
loop_time = time.time()
step_time = time.time()
status = '{} of {}'.format(index + 1, len(gdf_poly))
if groupby is not None:
feat_name = re.sub('[^0-9a-zA-Z]+', '-', str(feat[groupby])).strip('-')
if feat_name == '':
feat_name = 'No-Name'
else:
feat_name = ''
# From shapes create a blockgrid mask. -----------------------------------------------------
blockgrid_memfile = MemoryFile()
meta_block = rasterio.open(image_file).meta.copy()
blockgrid, new_blockmeta = convert_polygon_feature_to_raster(feat, pixel_size)
meta_block.update(count=1, **new_blockmeta)
with blockgrid_memfile.open(**meta_block) as dest:
dest.write(blockgrid, 1)
del blockgrid
if config.get_debug_mode():
LOGGER.info('{:<30} {:>10} {:<15} {dur}'.format(
'Feature to block_grid', status, feat_name,
dur=timedelta(seconds=time.time() - step_time)))
temp_file_list += [os.path.join(TEMPDIR, '{}_{}BlockGrid_{}_{}.tif'.format(
filename, len(temp_file_list) + 1, pixel_size_str, feat_name))]
save_in_memory_raster_to_file(blockgrid_memfile, temp_file_list[-1])
# Clip Dataset. ---------------------------------------------------------------------
geoms = [mapping(feat.geometry)]
with rasterio.open(image_file) as src:
crop_data, crop_transform = rio_mask(src, geoms, crop=True)
for iband in band_nums: # range(1, src.count + 1):
if crop_data[iband - 1].min() == crop_data[iband - 1].max():
raise GeometryError("There is no overlap between your polygon features "
"and selected band(s)")
meta = src.meta.copy()
meta.update({"driver": "GTiff",
"height": int(crop_data.shape[1]),
"width": int(crop_data.shape[2]),
"transform": crop_transform})
clip_memfile = MemoryFile()
with clip_memfile.open(**meta) as dest:
dest.write(crop_data)
# reapply band tags to store the index name.
for iband in range(1, src.count + 1):
# image statistics have changed so don't copy the tags
cleaned_tags = dict(
[(key, val) for key, val in src.tags(iband).iteritems() if
not key.upper().startswith('STATISTIC')])
if len(cleaned_tags) > 0:
dest.update_tags(iband, **cleaned_tags)
if config.get_debug_mode():
LOGGER.info('{:<30} {:>10} {:<15} {dur}'.format(
'Clipped Image to Feature', status, feat_name,
dur=timedelta(seconds=time.time() - step_time)))
temp_file_list += [os.path.join(TEMPDIR, '{}_{}crop_{}.tif'.format(
filename, len(temp_file_list) + 1, feat_name))]
save_in_memory_raster_to_file(clip_memfile, temp_file_list[-1])
step_time = time.time()
del crop_data, crop_transform, geoms
# Resample to new pixel size using averaging -----------------------------------------------
with clip_memfile.open() as src:
meta = src.meta.copy()
meta.update({'height': meta_block['height'],
'width': meta_block['width'],
'transform': meta_block['transform']})
# Resample all bands and write to file
resamp_memfile = MemoryFile()
with resamp_memfile.open(**meta) as dest:
for iband in range(1, src.count + 1):
reproject(source=rasterio.band(src, iband),
src_transform=src.transform,
destination=rasterio.band(dest, iband),
dst_transform=dest.transform,
resampling=Resampling.average)
# image statistics have changed so don't copy the tags
cleaned_tags = dict(
[(key, val) for key, val in src.tags(iband).iteritems() if
not key.upper().startswith('STATISTIC')])
if len(cleaned_tags) > 0:
dest.update_tags(iband, **cleaned_tags)
if config.get_debug_mode():
LOGGER.info('{:<30} {:>10} {:<15} {dur}'.format(
'Resampled to {}m'.format(pixel_size), status, feat_name,
dur=timedelta(seconds=time.time() - step_time)))
temp_file_list += [os.path.join(TEMPDIR, '{}_{}resampleAV_{}_{}.tif'.format(
filename, len(temp_file_list) + 1, pixel_size_str, feat_name))]
save_in_memory_raster_to_file(resamp_memfile, temp_file_list[-1])
step_time = time.time()
del clip_memfile
# get a count of holes to see if fill is required.
fillholes_memfile = MemoryFile()
with resamp_memfile.open() as src:
apply_nodata = int(src.nodata) if src.nodata.is_integer() else src.nodata
with blockgrid_memfile.open() as src_bg:
# find holes where blockgrid has data and resample is nodata - this can be used as
# the mask in the fillnodata at a later stage if required.
hole_count = np.logical_and((src_bg.read(1, masked=True).mask == 0),
(src.read(1, masked=True).mask == 1)).sum()
if hole_count == 0:
with fillholes_memfile.open(**src.meta) as dest:
for iband in range(1, src.count + 1):
band = src.read(iband)
# reapply block grid mask
with blockgrid_memfile.open() as src_bg:
new_band = np.where((src_bg.read(1, masked=True).mask == 0),
band, apply_nodata)
dest.write(new_band, iband)
# image statistics have changed so don't copy the tags
cleaned_tags = dict([(key, val) for key, val in src.tags(iband).iteritems()
if not key.upper().startswith('STATISTIC')])
if len(cleaned_tags) > 0:
dest.update_tags(iband, **cleaned_tags)
# Fill holes ---------------------------------------------------------------------------
if hole_count > 0:
with fillholes_memfile.open(**src.meta) as dest:
for iband in range(1, src.count + 1):
# for rasterio 1.0.3+ the mask= options will interpolate values for all
# designated nodata pixels (marked by zeros in `mask`)
fill_nd = fillnodata(src.read(iband, masked=True),
mask=None, # Dont use mask option
max_search_distance=100, # default is 100
smoothing_iterations=0)
# reapply block grid mask
with blockgrid_memfile.open() as src_bg:
filled = np.where((src_bg.read(1, masked=True).mask == 0),
fill_nd, apply_nodata)
dest.write(filled, iband)
# image statistics have changed so don't copy the tags
cleaned_tags = dict([(key, val) for key, val in src.tags(iband).iteritems()
if not key.upper().startswith('STATISTIC')])
if len(cleaned_tags) > 0:
dest.update_tags(iband, **cleaned_tags)
del fill_nd, filled
if config.get_debug_mode():
LOGGER.info('{:<30} {:>10} {:<15} {dur}'.format(
'Filled {} holes'.format(hole_count), status, feat_name,
dur=timedelta(seconds=time.time() - step_time)))
step_time = time.time()
temp_file_list += [os.path.join(TEMPDIR, '{}_{}filled_{}_{}.tif'.format(
filename, len(temp_file_list) + 1, pixel_size_str, feat_name))]
save_in_memory_raster_to_file(fillholes_memfile, temp_file_list[-1])
del resamp_memfile
# Apply Focal Statistics to Smooth ---------------------------------------------------------
with fillholes_memfile.open() as src:
meta = src.meta.copy()
if config.get_debug_mode():
temp_file_list += [os.path.join(TEMPDIR, '{}_{}smoothed_{}_{}.tif'.format(
filename, len(temp_file_list) + 1, pixel_size_str, feat_name))]
for i, iband in enumerate(band_nums, start=1): # range(1, src.count + 1):
smooth, _ = focal_statistics(src, iband, size=5, clip_to_mask=True)
# change np.nan to a number
smooth[np.isnan(smooth)] = apply_nodata
meta.update(dtype=rasterio.dtypes.get_minimum_dtype(smooth))
if config.get_debug_mode():
meta.update(count=len(band_nums))
with rasterio.open(temp_file_list[-1], 'w+', **meta) as dest:
dest.write(smooth, i)
# image statistics have changed so don't copy the tags
cleaned_tags = dict([(key, val) for key, val in src.tags(i).iteritems()
if not key.upper().startswith('STATISTIC')])
if len(cleaned_tags) > 0:
dest.update_tags(i, **cleaned_tags)
if 'name' in src.tags(iband):
index_str = src.tags(iband)['name']
else:
index_str = 'Band{}'.format(iband)
if feat_name == '':
out_imagefile = '{}_{}'.format(index_str, pixel_size_str)
else:
out_imagefile = '{}_{}_{}'.format(feat_name, index_str, pixel_size_str)
out_imagefile = re.sub('-+', '-', out_imagefile).strip('_')
out_imagefile = os.path.join(out_folder, out_imagefile + '.tif')
meta.update(count=1)
with rasterio.open(out_imagefile, 'w', tfw='YES', **meta) as dest, \
rasterio.Env(GDAL_TIFF_INTERNAL_MASK=True):
dest.write(smooth, 1)
# image statistics have changed so don't copy the tags
cleaned_tags = dict(
[(key, val) for key, val in src.tags(iband).iteritems() if
not key.upper().startswith('STATISTIC')])
if len(cleaned_tags) > 0:
dest.update_tags(1, **cleaned_tags)
output_files += [out_imagefile]
del smooth
if config.get_debug_mode():
LOGGER.info('{:<30} {:>10} {:<15} {dur}'.format(
'Smoothed and saved to {} file(s)'.format(len(band_nums)),
status, feat_name, dur=timedelta(seconds=time.time() - step_time)))
step_time = time.time()
del fillholes_memfile, blockgrid_memfile
LOGGER.info('{:<30} {:>10} {:<15} {dur}'.format(
'Created {} files for feature'.format(len(band_nums)),
status, feat_name, dur=timedelta(seconds=time.time() - loop_time)))
# del dest, src, src_bg
return output_files
[docs]def calc_indices_for_block(image_file, pixel_size, band_map, out_folder, indices=[], image_epsg=0,
image_nodata=None, polygon_shapefile=None, groupby=None, out_epsg=0):
""" Calculate indices for a multi band image then resample to a specified pixel size
and block grid extent for each shapefile polygon.
Use this tool to create single band images, one for each index and shapefile polygon
combination. A group-by column may be used to dissolve multiple polygons belonging to an
individual block.
If a polygon shapefile is not specified, polygons will be created from the images' mask
The polygon shapefile will be re-projected to match the image file
A block grid will be created for each feature for the nominated pixel size and used as the
base for analysis
"image_epsg" and "image_nodata" can be used to set the coordinate system and image nodata
values when they are not present within the image file.
The output filename will consist of the selected feature and calculated index.
The processing steps to achieve this are:
- Reproject image to new coordinate system if required.
- Calculate indices
- Dissolve polygons optionally using the group-by column
Loop through each polygon feature and......
- Create Block Grid
- Clip image to polygon
- Resample and fit to block grid for a pixel size and using Average resampling technique
- Identify holes and fill if necessary
- smooth image using a 5x5 pixel moving average (focal_statistics)
Args:
image_file (str): the input image file
pixel_size (int): the pixel size used for resampling
band_map (pyprecag.bandops.BandMapping): A dictionary matching band numbers to
band type (ie Red, Green, Blue etc.)
out_folder (str): The output folder for the created images.
indices (List[str]): The list of indices to calculate.
image_epsg (int): epsg number of the image to be used when missing in image
image_nodata (int): nodata value of the image to be used when missing in image
polygon_shapefile (str): a polygon shapefile used to cut up an image.
groupby (str): the column/field to use to group multiple features.
out_epsg (int): The epsg number representing the coordinate system of the output images.
Returns:
List[str]: the list of created images.
"""
if isinstance(polygon_shapefile, str) and polygon_shapefile.strip() == '':
polygon_shapefile = None
for ea_arg in [('image_file', image_file), ('out_folder', out_folder),
('polygon_shapefile', polygon_shapefile)]:
if ea_arg[0] != 'polygon_shapefile':
if ea_arg[1] is None or ea_arg[1].strip() == '':
raise ValueError('{} is required '.format(ea_arg[0]))
if ea_arg[1] is not None and not os.path.exists(ea_arg[1]):
raise IOError('{} does not exist-Got {}'.format(ea_arg[0], os.path.dirname(ea_arg[1])))
for ea_arg in [('image_epsg', image_epsg), ('out_epsg', out_epsg)]:
if not isinstance(ea_arg[1], (int, long)):
raise TypeError('{} must be a integer - Got {}'.format(*ea_arg))
filename, ext = os.path.splitext(os.path.basename(image_file))
if not os.path.basename(out_folder) == os.path.basename(image_file).replace('.', '_'):
out_folder = os.path.join(out_folder, os.path.basename(image_file).replace('.', '_'))
if not os.path.exists(out_folder):
os.mkdir(out_folder)
with NamedTemporaryFile(prefix='{}_reproj_'.format(filename),
suffix='.tif', dir=TEMPDIR) as new_file:
reproj_image = os.path.normpath(new_file.name)
reproject_image(image_file, reproj_image, out_epsg, image_nodata=image_nodata,
image_epsg=image_epsg)
with NamedTemporaryFile(prefix='{}_indices_'.format(filename),
suffix='.tif', dir=TEMPDIR) as new_file:
indices_image = os.path.normpath(new_file.name)
calculate_image_indices(reproj_image, band_map, indices_image, indices)
if polygon_shapefile is None or polygon_shapefile.strip() == '':
with NamedTemporaryFile(prefix='{}_BlockPolygons_'.format(filename),
suffix='.shp', dir=TEMPDIR) as new_file:
polygon_shapefile = os.path.normpath(new_file.name)
# create a polygon from the image. hopefully by now the nodata val is correct.
with rasterio.open(reproj_image) as src:
band = src.dataset_mask()
mask = band == 255 # mask is where band == 255
# Extract feature shapes and values from the array.
rast_shapes = rasterio.features.shapes(band, transform=src.transform, mask=mask)
geoms_geojson = [{'properties': {'val': val}, 'geometry': geom} for i, (geom, val) in
enumerate(rast_shapes)]
gdf_poly = GeoDataFrame.from_features(list(geoms_geojson), crs=from_epsg(image_epsg))
save_geopandas_tofile(gdf_poly, polygon_shapefile, overwrite=True)
del band, rast_shapes, geoms_geojson, mask
out_files = multi_block_bands_processing(indices_image, pixel_size, out_folder,
polygon_shapefile=polygon_shapefile, groupby=groupby)
if not config.get_debug_mode():
if TEMPDIR in indices_image:
os.remove(indices_image)
if TEMPDIR in reproj_image:
os.remove(reproj_image)
if TEMPDIR in polygon_shapefile:
os.remove(polygon_shapefile)
return out_files
[docs]def resample_bands_to_block(image_file, pixel_size, out_folder, band_nums=[], image_epsg=0,
image_nodata=None, polygon_shapefile=None, groupby=None, out_epsg=0):
"""Derive multiple resampled image bands matching the specified pixel size and block grid extent
for each shapefile polygon.
Use this tool create individual band images for each polygon within a shapefile. A group-by
column may be used to dissolve multiple polygons belonging to an individual block. The fitting
of rasters to a base Block (grid) ensures for easier, more accurate multi-layered analysis
required by in Precision Agriculture.
The processing steps to achieve this are:
- Reproject image to nominated coordinate system
- Dissolve polygons optionally using the groupby column
Loop through each polygon feature and......
- Create Block Grid
- Clip image to polygon
- Resample and fit to block grid for a pixel size and using Average resampling technique
- Identify holes and fill if necessary
- smooth image using a 5x5 pixel moving average (focal_statistics)
If a polygon shapefile is not specified, polygons will be created from the images' mask
The polygon shapefile will be re-projected to match the image file
A block grid will be created for each feature for the nominated pixel size and used as the
base for analysis
"image_epsg" and "image_nodata" can be used to set the coordinate system and image nodata values
when they are not present within the image file.
The output filename will consist of the selected band number or the value of a band's rasterio
custom name tag
Args:
image_file (str): An input image
pixel_size (int, float): The desired output pixel size in metres.
out_folder (str): The output folder for the created images.
band_nums (List[int]): a list of band numbers to process. If empty all bands will be used
image_epsg (int): epsg number of the image to be used when missing in image
image_nodata (int): nodata value of the image to be used when missing in image
polygon_shapefile (str): a polygon shapefile used to cut up an image.
groupby (str): the column/field to use to group multiple features.
out_epsg (int): The epsg number representing the output coordinate system.
Returns:
List[str]: a list of created files.
"""
filename, ext = os.path.splitext(os.path.basename(image_file))
for ea_arg in [('image_file', image_file), ('out_folder', out_folder)]:
if ea_arg[1] is None or ea_arg[1].strip() == '':
raise ValueError('{} is required '.format(ea_arg[0]))
if ea_arg[1] is not None and not os.path.exists(ea_arg[1]):
raise IOError('{} does not exist-Got {}'.format(ea_arg[0], os.path.dirname(ea_arg[1])))
if not os.path.basename(out_folder) == os.path.basename(image_file).replace('.', '_'):
out_folder = os.path.join(out_folder, os.path.basename(image_file).replace('.', '_'))
if not os.path.exists(out_folder):
os.mkdir(out_folder)
with NamedTemporaryFile(prefix='{}_reproj_'.format(filename),
suffix='.tif', dir=TEMPDIR) as new_file:
reproj_image = os.path.normpath(new_file.name)
reproject_image(image_file, reproj_image, out_epsg, image_nodata=image_nodata,
image_epsg=image_epsg)
out_files = multi_block_bands_processing(reproj_image, pixel_size, out_folder, groupby=groupby,
band_nums=band_nums,
polygon_shapefile=polygon_shapefile)
if TEMPDIR in reproj_image and not config.get_debug_mode():
os.remove(reproj_image)
return out_files
[docs]def kmeans_clustering(raster_files, output_tif, n_clusters=3, max_iterations=500):
"""Create zones with k-means clustering from multiple raster files as described in
The input raster files should all:
- have the same pixel size
- be in the same coordinate system
- should overlap
Only the first band of each raster will be used.
The output TIFF image extent will be the minimum overlapping extent of the input images.
Each image will be resampled to a fixed coordinate to ensure pixels between images align.
Args:
raster_files (List[str]): The list of input raster files
output_tif (str): The output TIFF file
n_clusters (int): The number of clusters/zones to create.
max_iterations (int): Maximum number of iterations for k-means algorithm in a single run.
Returns:
pandas.core.frame.DataFrame: A dataframe containing cluster statistics for each image.
"""
if not isinstance(n_clusters, (int, long)):
raise TypeError('Size must be an Integer.')
if not isinstance(max_iterations, (int, long)):
raise TypeError('Size must be an Integer.')
if not isinstance(raster_files, list):
raise TypeError('Invalid Type: raster_files should be a list')
not_exists = [my_file for my_file in raster_files if not os.path.exists(my_file)]
if len(not_exists) > 0:
raise IOError('raster_files: {} raster file(s) do '
'not exist\n\t({})'.format(len(not_exists), '\n\t'.join(not_exists)))
temp_file_list = []
start_time = time.time()
images_combined, band_list = stack_and_clip_rasters(raster_files,
output_tif='1_kmeans_combined.tif')
temp_file_list.append(images_combined)
# Make sure ALL masks are saved inside the TIFF file not as a sidecar.
gdal.SetConfigOption('GDAL_TIFF_INTERNAL_MASK', 'YES')
step_time = time.time()
# Extract data ready for k-means ------------------------------------------------
with rasterio.open(images_combined) as src:
template_band = src.read(1, masked=True)
stack_meta = src.meta.copy()
bands = src.read(masked=True)
data_stack = []
for ea_band in bands:
new_band = np.ma.masked_values(ea_band, src.nodata)
data_stack.append(np.ma.compressed(ea_band))
data_stack = np.array(data_stack)
# Now we need to whiten and/or normalise the data prior to clustering
# This ensures each variable carries the same weight despite different input data ranges
norm_stack = np.array([stats.zscore(ea) for ea in data_stack])
if config.get_debug_mode():
# write mask to file
temp_file_list += [os.path.join(
TEMPDIR, '{}_kmeans_normalised_image.tif'.format(len(temp_file_list) + 1))]
with rasterio.open(temp_file_list[-1], 'w', **stack_meta) as tmp_dst:
for i, ea in enumerate(norm_stack, start=1):
# recreate the full array dimensions
new_band = np.empty_like(template_band)
np.place(new_band, ~new_band.mask, ea)
# reapply the mask to remove values assigned to nodata pixels
new_band = np.ma.masked_values(new_band, tmp_dst.nodata)
tmp_dst.update_tags(i, name=band_list[i - 1])
tmp_dst.write(new_band, i)
tmp_dst.descriptions = band_list
LOGGER.info('{:<30} {:<15} {dur}'.format('Images Normalised', '',
dur=timedelta(seconds=time.time() - step_time)))
step_time = time.time()
# Run the k-means clustering -------------------------------------------------------------------
_, code = kmeans2(data=norm_stack.T, k=n_clusters, iter=max_iterations, minit='points')
# adjust the class values to start from 1 so 0 can be used as nodata
code = code + 1
# Write to file
cluster_data = np.empty_like(template_band, dtype='int')
np.place(cluster_data, ~cluster_data.mask, code.T)
# set nodata to 0
cluster_data = np.ma.masked_values(cluster_data, 0)
cluster_dtype = rasterio.dtypes.get_minimum_dtype([0] + cluster_data)
cluster_meta = stack_meta.copy()
cluster_meta.update({'count': 1, 'nodata': 0, 'dtype': cluster_dtype})
with rasterio.open(output_tif, 'w', **cluster_meta) as dst:
dst.write(cluster_data.astype(cluster_dtype), 1)
LOGGER.info('{:<30} {:<15} {dur}'.format('Clustering complete', '',
dur=timedelta(seconds=time.time() - step_time)))
step_time = time.time()
# create summary statistics --------------------------------------------------------------------
# create a table to store statistics
results_df = pd.DataFrame(columns=['zone'])
with rasterio.open(output_tif) as src_clust, \
rasterio.open(images_combined) as src_img:
# get the list of clusters to process
clust_list = list(np.unique(src_clust.read()))
clust_list.remove(src_clust.nodata)
bands = src_img.read(masked=True)
for ea_clust in clust_list:
# get pixels for each cluster
clust_mask = np.where((src_clust.read(1, masked=True) == ea_clust), 1, 0)
new_row = pd.DataFrame([ea_clust], columns=['zone'])
img_meta = src_img.meta.copy()
with MemoryFile() as memfile:
# apply cluster mask to all bands
with memfile.open(**img_meta) as tmp_dst:
tmp_dst.write_mask(clust_mask.astype(img_meta['dtype']))
tmp_dst.write(bands)
with memfile.open() as tmp_src:
for i, ea_band in enumerate(tmp_src.read(masked=True)):
new_row[src_img.descriptions[i] + ', mean'] = np.nanmean(ea_band)
new_row[src_img.descriptions[i] + ', std'] = np.nanstd(ea_band)
# for pandas 0.23.4 add sort=False to prevent row and column orders to change.
try:
results_df = new_row.append(results_df, ignore_index=True, sort=False)
except TypeError:
results_df = new_row.append(results_df, ignore_index=True)
# Move 'Zone' to the first column - fixed in pandas 0.23.4 by adding sort=False to append
columns = list(results_df.columns)
columns.insert(0, columns.pop(columns.index('zone')))
results_df = results_df.reindex(columns=columns)
results_df.sort_values(by=['zone'], ascending=True, axis=0, inplace=True)
# write to csv without ', ' to assist when loading into ESRI
col_names = results_df.columns.str.replace(', ', '_').values
results_df.to_csv(output_tif.replace('.tif', '_statistics.csv'),
header=col_names, index=False)
# format the table and print to log.
results_df_copy = results_df.copy()
col_names = results_df_copy.columns.str.split(', ', expand=True).values
# replace column name NaNs to '....'. '....' as multiple spaces aren't allowed
results_df_copy.columns = pd.MultiIndex.from_tuples(
[('.......', x[0]) if pd.isnull(x[1]) else x for x in col_names])
LOGGER.info('\nCluster Statistics:\n{}\n'.format(
results_df_copy.to_string(justify='center', index=False)))
del results_df_copy
LOGGER.info('{:<30} {:<15} {dur:<15} {}'.format(
'Saved Stats CSV', '', output_tif.replace('.tif', '_statistics.csv'),
dur=timedelta(seconds=time.time() - step_time)))
# clean up of intermediate files
if len(temp_file_list) > 0 and not config.get_debug_mode():
for ea in temp_file_list:
for f in glob.glob(os.path.splitext(ea)[0] + '*'):
os.remove(f)
gdal.SetConfigOption('GDAL_TIFF_INTERNAL_MASK', None)
LOGGER.info('{:<30} {:<15} {dur:<15} {}'.format(
'K-Means Clustering Completed', '', '{} zones for {} rasters'.format(
n_clusters, len(raster_files)), dur=timedelta(seconds=time.time() - start_time)))
return results_df
[docs]def create_points_along_line(lines_geodataframe, lines_crs, distance_between_points,
offset_distance, out_epsg=0, out_points_shapefile=None,
out_lines_shapefile=None):
"""Add points along a line using a specified distance and create left/right parallel points
offset by a distance.
If the lines are in a geographic coordinate system they will be re-projected to a projected
coordinate system.
All touching lines will be treated as one.
MultiPart geometry will be converted to single part geometry.
The first and last points will be offset from start/end of the line evenly.
Attributes from the input lines will be lost.
line_crs is used to ensure that the correct wkt definition is maintained when using
geopandas.
Args:
lines_geodataframe (geopandas.geodataframe.GeoDataFrame): A Geopandas dataframe
containing Lines
lines_crs (pyprecag.crs.crs): The detailed coordinate system
distance_between_points (int): The separation distance between points.
offset_distance (int): The distance between the Strip point and parallel point.
out_epsg (int): Optionally specify the epsg number for the output coordinate system.
This should be a project coordinate system
out_points_shapefile (str): Optionally specify shapefile path and filename used to
save the points to. If a path is not supplied, it will save the file
to TEMPDIR by default.
out_lines_shapefile (str): Optionally specify shapefile path and filename used to save
the lines to. If a path is not supplied, it will save the file to
TEMPDIR by default
Returns:
geopandas.geodataframe.GeoDataFrame: The geodataframe containing the created points.
pyprecag.crs.crs: The coordinate system of both the points and lines geodataframe.
geopandas.geodataframe.GeoDataFrame: The geodataframe containing the created lines.
"""
if not isinstance(lines_geodataframe, GeoDataFrame):
raise TypeError('Invalid input data : inputGeodataFrame')
if 'LINE' not in ','.join(list(lines_geodataframe.geom_type.unique())).upper():
raise GeometryError('Invalid input data : A lines geopandas dataframe is required')
for argCheck in [('offset_distance', offset_distance),
('distance_between_points', distance_between_points)]:
if not isinstance(argCheck[1], (int, long, float)):
raise TypeError('{} must be a floating number.'.format(argCheck[0]))
if not isinstance(lines_crs, pyprecag_crs.crs):
raise TypeError('Crs must be an instance of pyprecag.crs.crs')
if not isinstance(out_epsg, (int, long)):
raise TypeError('out_epsg must be a integer - Got {}'.format(*out_epsg))
if out_points_shapefile is not None and os.path.isabs(out_points_shapefile):
if not os.path.exists(os.path.dirname(out_points_shapefile)):
raise IOError(
'Output directory {} does not exist'.format(os.path.dirname(out_points_shapefile)))
if out_lines_shapefile is not None and os.path.isabs(out_lines_shapefile):
if not os.path.exists(os.path.dirname(out_lines_shapefile)):
raise IOError(
'Output directory {} does not exist'.format(os.path.dirname(out_lines_shapefile)))
if out_points_shapefile is not None and out_lines_shapefile is not None:
if out_points_shapefile == out_lines_shapefile:
raise IOError('Output points and lines shapefile names are identical')
# create temp files template
with NamedTemporaryFile(prefix="strip_treatment_", suffix='.shp', dir=TEMPDIR) as new_file:
temp_filename = new_file.name
start_time = time.time()
step_time = time.time()
if out_epsg > 0:
# Make sure we get the correct details for the output coordinate system
points_crs = pyprecag_crs.crs()
points_crs.getFromEPSG(out_epsg)
# overwrite the gdf proj4 string with the epsg mapping equivalent to maintain the correct wkt.
lines_geodataframe.crs = lines_crs.epsg
if out_epsg > 0:
# Make sure we get the correct details for the output coordinate system
points_crs = pyprecag_crs.crs()
points_crs.getFromEPSG(out_epsg)
# input needs to be a projected coordinate system to work with metric distances
if lines_crs.srs.IsGeographic():
if out_epsg <= 0:
xmin, ymin, _, _ = lines_geodataframe.total_bounds
points_crs = pyprecag_crs.getProjectedCRSForXY(xmin, ymin, lines_crs.epsg_number)
else:
points_crs = lines_crs
# project if required.
if lines_crs.epsg_number != points_crs.epsg_number:
lines_geodataframe.to_crs(epsg=points_crs.epsg_number, inplace=True)
if config.get_debug_mode():
LOGGER.info('{:<30} {:<15} {dur}'.format(
'Reproject lines To epsg {}'.format(points_crs.epsg_number), '',
dur=timedelta(seconds=time.time() - step_time)))
step_time = time.time()
# merge touching lines
if len(lines_geodataframe) > 1:
gdf_lines = GeoDataFrame(geometry=[linemerge(lines_geodataframe.unary_union)],
crs=lines_geodataframe.crs)
else:
gdf_lines = lines_geodataframe[['geometry']].copy()
# convert multi part to single part geometry
gdf_lines = gdf_lines.explode()
if isinstance(gdf_lines, GeoSeries):
# geopandas 0.3.0 explode creates a geoseries so convert back to geodataframe
gdf_lines = GeoDataFrame(geometry=gdf_lines, crs=lines_geodataframe.crs)
# explode creates a multi index so flatten to single level
gdf_lines = gdf_lines.reset_index().drop(['level_0', 'level_1'], axis=1)
# assign a name to the index column
gdf_lines.index.name = 'FID'
if gdf_lines['geometry'][0].has_z:
gdf_lines['geometry'] = gdf_lines['geometry'].apply(lambda x: drop_z(x))
# Add TrialID Side, Length and startoffset
if 'TrialID' not in gdf_lines.columns:
gdf_lines.insert(0, 'TrialID', gdf_lines.index)
else:
gdf_lines['TrialID'] = gdf_lines.index
# Add Side, Length and startoffset attributes
gdf_lines['Strip_Name'] = 'Strip'
# find dangle length when the line isn't evenly divided by the point distance
# Calculate the length of each line
gdf_lines['length'] = gdf_lines['geometry'].length
# calculate a start offset to centre points along the line
gdf_lines['startoffset'] = (gdf_lines['geometry'].length % distance_between_points) / 2
# create a new dataframe for parallel lines
gdf_lrline = GeoDataFrame(columns=['FID', 'TrialID', 'Strip_Name', 'geometry'],
geometry='geometry', crs=gdf_lines.crs)
# create L/R lines for each centre line
for index, c_line_row in gdf_lines.iterrows():
c_line_geom = c_line_row['geometry'].simplify(0.5, preserve_topology=True)
line_bearing = point_to_point_bearing(c_line_geom.coords[-1],
c_line_geom.coords[0])
for ea_line in [('left', '{} Strip'.format(deg_to_8_compass_pts(line_bearing - 90))),
('right', '{} Strip'.format(deg_to_8_compass_pts(line_bearing + 90)))]:
side, value = ea_line
# update the geometry and TrialID.
parallel_line = c_line_geom.parallel_offset(offset_distance, side, resolution=16,
join_style=1, mitre_limit=5.0)
parallel_line = parallel_line.simplify(0.5, preserve_topology=True)
# One of the lines needs to be flipped
if side == 'right':
parallel_line = LineString(parallel_line.coords[::-1])
gdf_lrline = gdf_lrline.append({'TrialID': c_line_row['TrialID'],
'Strip_Name': value, 'geometry': parallel_line},
ignore_index=True)
gdf_lrline['FID'] = gdf_lrline.index
if config.get_debug_mode():
LOGGER.info('{:<30} {:<15} {dur}'.format(
'Parallel lines created', '', dur=timedelta(seconds=time.time() - step_time)))
step_time = time.time()
# Create an empty dataframe to store points in
gdf_points = GeoDataFrame(columns=['FID', 'TrialID', 'Strip_Name', 'PointID', 'geometry'],
geometry='geometry',
crs=gdf_lrline.crs)
# Loop through each centre line
for index_c, line_c in gdf_lines.iterrows():
distance = line_c['startoffset']
ptid = 1
while distance < line_c['length']:
# Add point along the centre line.
pt = line_c['geometry'].interpolate(distance)
# add it to the dataframe
gdf_points = gdf_points.append({'geometry': pt,
'TrialID': line_c['TrialID'],
'PointID': ptid,
'Strip_Name': line_c['Strip_Name'],
'DistOnLine': distance},
ignore_index=True)
# To add points to Offset lines, first find corresponding lines.
line_subset = gdf_lrline[gdf_lrline['TrialID'] == line_c['TrialID']]
for index_lr, line_lr in line_subset.iterrows():
# find the distance along the L/R line of the corresponding centre line point
dist_along_line = line_lr['geometry'].project(pt)
# Add a new point feature. Interpolate locates the xy based on the distance
# along the line.
gdf_points = gdf_points.append(
{'geometry': line_lr['geometry'].interpolate(dist_along_line),
'TrialID': line_c['TrialID'],
'PointID': ptid,
'Strip_Name': line_lr['Strip_Name'],
'DistOnLine': distance},
ignore_index=True)
distance += distance_between_points
ptid += 1
# add a feature identifier
gdf_points['FID'] = gdf_points.index
# combine to original centre line while only keeping common columns and calculate length
gdf_lines = pd.concat([gdf_lines, gdf_lrline], join='inner',
axis=0).sort_values(['TrialID', 'Strip_Name'])
gdf_lines['length'] = gdf_lines['geometry'].length
if out_lines_shapefile is not None or config.get_debug_mode():
if out_lines_shapefile is None:
save_geopandas_tofile(gdf_lines, temp_filename.replace('.shp', '_lines.shp'),
overwrite=True)
else:
save_geopandas_tofile(gdf_lines, out_lines_shapefile, overwrite=True)
if out_points_shapefile is not None or config.get_debug_mode():
if out_points_shapefile is None:
save_geopandas_tofile(gdf_points, temp_filename.replace('.shp', '_points.shp'),
overwrite=True)
else:
save_geopandas_tofile(gdf_points, out_points_shapefile, overwrite=True)
if config.get_debug_mode():
LOGGER.info('{:<30} {:>15} {dur}'.format(
'Create Points Along Line Completed', '',
dur=timedelta(seconds=time.time() - start_time)))
return gdf_points, points_crs, gdf_lines
[docs]def ttest_analysis(points_geodataframe, points_crs, values_raster, out_folder,
zone_raster='', control_raster='', size=5, create_graph=False):
"""Run a moving window t-test analysis for a strip trial as described in Lawes
and Bramley (2012).
Format of the points must be from the create_points_along_line tools.
All input rasters must be of the same coordinate system and pixel size and overlap with the
points.
Output statistics include:
controls_mean - row by row mean of the control columns
treat_diff - row by row difference between the treatment and controls_mean columns
av_treat_diff - calculate mean of values using a moving window using the treat_diff column
p_value - calculate p_value using a moving window using treatment and controls_mean columns
RI - Response Index using the treatment and controls_mean columns
Output Files include:
For each line and strip combination :
- png Map showing orientation of the line (start and ends)
- png set of graphs
- CSV file of derived statistics.
Reference:
Lawes RA, Bramley RGV. 2012. A Simple Method for the Analysis of On-Farm Strip Trials.
Agronomy Journal 104, 371-377.
Args:
points_geodataframe (geopandas.geodataframe.GeoDataFrame): points derived using
create_points_along_line
points_crs (pyprecag.crs.crs):the coordinate system for the points.
values_raster (str): a kriged raster containing the treatment values
out_folder (str): folder for output files.
zone_raster (str): a raster containing zones.
control_raster (str): a kriged raster of
size (int): the size used to calculate the moving window statistics.
create_graph (bool):
Returns:
pandas.core.frame.DataFrame: dataframe containing output statistics
"""
if not isinstance(points_geodataframe, GeoDataFrame):
raise TypeError('Invalid input data : inputGeodataFrame')
if 'POINT' not in ','.join(list(points_geodataframe.geom_type.unique())).upper():
raise GeometryError('Invalid input data : a points geopandas dataframe is required')
if not isinstance(points_crs, pyprecag_crs.crs):
raise TypeError('Crs must be an instance of pyprecag.crs.crs')
if out_folder is None or out_folder == '':
raise ValueError('Please specify an output folder')
if not os.path.exists(out_folder):
raise IOError('Output directory {} does not exist'.format(out_folder))
if not isinstance(size, int): # or size % 2 == 0 - odd numbers only
raise TypeError("Size {} should be an integer.".format(size))
if zone_raster is None:
zone_raster = ''
if control_raster is None:
control_raster = ''
not_exists = [ea for ea in [values_raster, control_raster, zone_raster] if ea and
not os.path.exists(ea)]
if len(not_exists) > 0:
raise IOError('raster files: {} raster file(s) '
'do not exist\n\t({})'.format(len(not_exists), '\n\t'.join(not_exists)))
del not_exists
missing = [ea for ea in ['TrialID', 'PointID', 'Strip_Name', 'DistOnLine']
if ea and ea not in points_geodataframe.columns]
if len(missing) > 0:
raise ValueError("columns not found - {}. Please use the output from the "
"create_points_along_line. (PAT's create strip trial points)"
" ".format(len(missing), ','.join(missing)))
del missing
# get list of valid rasters
raster_files = [ea for ea in [values_raster, control_raster, zone_raster] if ea]
# make sure rasters overlap ------------------------------------
try:
for i, ea_raster in enumerate(raster_files, start=1):
with rasterio.open(ea_raster) as src:
band1 = src.read(1, masked=True)
# get minimum data extent
data_window = get_data_window(band1)
# find the intersection of all the windows
if i == 1:
min_window = data_window
else:
# create a new window using the last coordinates based on this image in case
# extents/pixel origins are different
min_img_window = from_bounds(*min_bbox, transform=src.transform)
# find the intersection of the windows.
min_window = intersection(min_img_window, data_window).round_lengths('ceil')
# convert the window co coordinates
min_bbox = src.window_bounds(min_window)
del min_window
except rasterio.errors.WindowError as e:
# reword 'windows do not intersect' error message
if not e.args:
e.args = ('',)
e.args = ("Rasters do not overlap",)
raise # re-raise current exception
transect_column = 'Strip_Name'
x_axis_column = 'DistOnLine'
line_count = len(points_geodataframe['TrialID'].unique())
# Extract raster values for points ------------------------------------------------------------
gdf_points, points_crs = extract_pixel_statistics_for_points(points_geodataframe, points_crs,
raster_files, 'extract_pixels.csv',
size_list=[1])
gdf_points.index.name = 'rowID'
column_names = {}
# find the columns relating to the rasters
if values_raster != '':
rastname = os.path.splitext(os.path.basename(values_raster))[0]
column_names['Value'] = next(eaString for eaString in gdf_points.columns
if rastname in eaString)
if control_raster != '':
rastname = os.path.splitext(os.path.basename(control_raster))[0]
column_names['Control'] = next(eaString for eaString in gdf_points.columns
if rastname in eaString)
if zone_raster != '':
rastname = os.path.splitext(os.path.basename(zone_raster))[0]
column_names['Zone'] = next(eaString for eaString in gdf_points.columns
if rastname in eaString)
else:
# Add a zone col with a constant value
gdf_points['Zone'] = 1
column_names['Zone'] = 'Zone'
# rename columns from raster name to the dictionary key to be more generic
gdf_points.rename(columns=dict([[v, k] for k, v in column_names.iteritems()]), inplace=True)
# create a unique id for each line/point
gdf_points.insert(1, 'TrialPtID',
gdf_points.apply(lambda x: "'{}-{}'".format(x['TrialID'],
x['PointID']), axis=1))
# Apply bearing to all transects
for group_name, gdf_group in gdf_points.groupby(['TrialID', transect_column]):
gdf_group = gdf_group.copy()
gdf_group['geom2'] = gdf_group['geometry'].shift()
gdf_group['label_angle'] = gdf_group.dropna(subset=['geom2'], axis=0).apply(
lambda p: text_rotation(p['geometry'], p['geom2']), axis=1)
# update the main dataframe - needs matching indexes
gdf_points.loc[gdf_points.index.isin(gdf_group.index), 'label_angle'] = \
gdf_group[['label_angle']]
del group_name, gdf_group
offset_names = gdf_points['Strip_Name'].unique().tolist()
strip_name = offset_names.pop(offset_names.index('Strip'))
# pivot/transpose the table.
df_pivot = pd.DataFrame(gdf_points).pivot(index='TrialPtID', columns=transect_column,
values=column_names.keys())
# clean up un required columns
if control_raster != '': # only keep offset strips
dropcols = [('Control', strip_name)]
dropcols += [('Value', ea) for ea in offset_names]
df_pivot.drop(dropcols, axis=1, inplace=True)
# change the order before joining levels together
df_pivot.columns = df_pivot.columns.swaplevel(0, 1)
# Fix column headers
df_pivot.columns = df_pivot.columns.map(' '.join).str.strip()
''' Ensure zone columns are integer. cant be integer and contain NAN's so do it later.
TODO: In v0.24, you can now do df = df.astype(pd.Int32Dtype())
http://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html
df_pivot[column_names['Zone']] = df_pivot[column_names['Zone']].astype(pd.Int8Dtype())'''
# Prepare to join tables --------------------------------------------------------------------
# drop geometry etc. and create flat table.
df_table = pd.DataFrame(gdf_points.drop(columns='geometry', inplace=False))
# Keep only minimum of columns
dropcols = [ea for ea in df_table.columns if
ea not in ['FID', 'TrialPtID', 'TrialID', 'PointID', 'DistOnLine']]
df_table.drop(dropcols, axis=1, inplace=True)
# remove duplicate points only keeping first - ie the Strip
df_table.drop_duplicates(subset=['TrialPtID'], keep='first', inplace=True)
# add pivot columns back to original table --------------------------------------------------
df_table = pd.merge(df_table, df_pivot, on='TrialPtID')
pivot_columns = df_pivot.columns
del df_pivot
import matplotlib as mpl
# suppress ImportError: No module named _tkinter, please install the python-tk package
mpl.use('Agg')
import matplotlib.pyplot as plt
# Suppress interactive plotting
plt.ioff()
from matplotlib import gridspec
from collections import OrderedDict
import matplotlib.patheffects as pe
for iline, (line_id, gdf_strip) in enumerate(gdf_points.groupby(['TrialID']), start=1):
status = '{} of {}'.format(iline, line_count)
loop_time = time.time()
df_subtable = df_table[df_table['TrialID'] == line_id].copy()
offset_names = gdf_strip['Strip_Name'].unique().tolist()
strip_name = offset_names.pop(offset_names.index('Strip'))
# determine which columns to use
keepcols = [col for col in pivot_columns if len(col.split(' ')) == 2]
keepcols += [col for x in offset_names for col in pivot_columns if col.startswith(x)]
dropcols = [ea for ea in pivot_columns if ea not in keepcols]
column_names = defaultdict(list)
for ea in keepcols:
parts = ea.split(' ')
if len(parts) == 2:
column_names[parts[-1]].append(ea)
elif parts[-1] != 'Zone':
column_names['Control'].append(ea)
else:
column_names[parts[-1]].append(ea)
df_subtable.drop(dropcols, axis=1, inplace=True)
# Convert zones to integer - there should be no NAN's
df_subtable[column_names['Zone']] = df_subtable[column_names['Zone']].astype(int)
# run loop for the three scenarios. 1) both sides (N+S), 2) North, 3) South
scenarios = [offset_names] + offset_names
for iscenario, scenario in enumerate(scenarios, start=1):
if not isinstance(scenario, list):
scenario = [scenario]
control_col = [col for x in scenario for col in column_names['Control']
if col.startswith(x)]
valid_zone_col = [col for x in scenario + ['Strip'] for col in column_names['Zone']
if col.startswith(x)]
dropcols = [col for col in column_names['Control']
if col not in control_col + valid_zone_col + column_names['Value']]
dropcols += [col for col in column_names['Zone']
if col not in control_col + valid_zone_col + column_names['Value']]
offset_subst = '-'.join([ea.split(' ')[0] for ea in control_col])
file_path_noext = os.path.join(out_folder, "Trial-{}_{}-strip".format(line_id,
offset_subst))
df_statstable, control_mean = calculate_strip_stats(df_subtable.drop(columns=dropcols),
column_names['Value'][0],
control_col, size=size)
df_statstable.drop(columns=['FID', 'TrialPtID'], axis=1).to_csv(
file_path_noext + '.csv', index=False)
if config.get_debug_mode():
LOGGER.info('{:<30}\t{:>10} {dur:<15} {}'.format(
'Saved CSV', '', file_path_noext + '.csv',
dur=timedelta(seconds=time.time() - loop_time)))
''' Create the map & graphs ---------------------------------------------------------
https://stackoverflow.com/a/39694347
https://stackoverflow.com/a/14887119'''
markers = ['.', '^', '+', 'x', '*', 'o', ',', 'v', '<', '>', 's', 'd']
colour_list = ['r', 'g', 'b', 'y']
zone_column = 'Strip Zone'
# add plotting parameters to table -----------------------------------------------
# assign an id to the column
df_statstable['zone_UID'] = df_statstable.groupby(zone_column).grouper.group_info[0]
# assign a marker based on the index from the list
df_statstable["zone_marker"] = df_statstable['zone_UID'].apply(lambda x: markers[x])
# drop nan's introduced by the rolling window
df_statstable = df_statstable.dropna(subset=['av_treat_dif'], axis=0).copy()
# only get those points geometry with rolling window results.
gdf_map = pd.merge(gdf_strip, df_statstable[['p_value']],
on='TrialPtID').set_index('FID', drop=False) # ,'label_angle'
# get y axis limits
min_ylimit = df_statstable['av_treat_dif'].min(skipna=True)
max_ylimit = df_statstable['av_treat_dif'].max(skipna=True)
use_pvalue = 0.05
df_statstable['sig_color'] = df_statstable['p_value'].apply(
lambda x: 'r' if x > use_pvalue else 'k')
df_statstable['sig_label'] = df_statstable['p_value'].apply(
lambda x: 'not significant' if x > use_pvalue else 'significant')
# Create the map only once for both lines ----------------------------------------------
if iscenario == 1:
# get start middle and ends of lines for labelling
start_end_pts = gdf_map[gdf_map[transect_column] == 'Strip'].iloc[[0, -1]]
middle_pts = gdf_map[gdf_map['PointID'] ==
gdf_map.iloc[int(len(gdf_map) / 2)]['PointID']]
# create lines
# Aggregate points with the GroupBy - for shapely 1.3.2+ use the point objects
try:
gdf_lines = gdf_map.groupby(['Strip_Name'])['geometry'].apply(
lambda x: LineString(x.tolist()) if x.size > 1 else x.tolist())
except ValueError:
gdf_lines = gdf_map.groupby(['Strip_Name'])['geometry'].apply(
lambda x: LineString([(p.x, p.y) for p in x]))
gdf_lines = GeoDataFrame(gdf_lines, geometry='geometry')
fig_map, ax = plt.subplots(figsize=(5, 5))
# fix the extent to the vector
ax.set_aspect('equal')
ax.set_title('Map for Trial {}'.format(line_id), fontsize=8)
ax.set_xlim(left=gdf_strip.total_bounds[0] - 50,
right=gdf_strip.total_bounds[2] + 50)
ax.set_ylim(bottom=gdf_strip.total_bounds[1] - 50,
top=gdf_strip.total_bounds[3] + 50)
# plot the line
gdf_lines.plot(ax=ax, cmap='rainbow', legend=True)
# _ = gdf_map.plot(ax=ax,marker='o',column='Strip_Name',cmap='rainbow',markersize=5)
ax.tick_params(which='major', width=0.75, length=2.5, labelsize=4)
# ,top=True,labeltop=True,bottom=False,labelbottom=False)
plt.setp(ax.spines.values(), linewidth=0.5)
ax.get_xaxis().set_major_formatter(mpl.ticker.FuncFormatter(
lambda x, p: "{:,.0f}".format(x)))
ax.get_yaxis().set_major_formatter(mpl.ticker.FuncFormatter(
lambda x, p: "{:,.0f}".format(x)))
# plt.xticks(rotation='vertical')
anno_kwargs = {'size': 7, 'va': 'center', 'ha': 'center',
'path_effects': [pe.withStroke(linewidth=4, foreground="white")]}
middle_pts.apply(lambda x: ax.annotate(x['Strip_Name'],
xy=(x.geometry.x, x.geometry.y),
rotation=x['label_angle'], **anno_kwargs),
axis=1)
ax.annotate('Start', xy=(start_end_pts.iloc[0].geometry.x,
start_end_pts.iloc[0].geometry.y),
rotation=start_end_pts.iloc[0]['label_angle'], **anno_kwargs)
ax.annotate('End', xy=(start_end_pts.iloc[-1].geometry.x,
start_end_pts.iloc[-1].geometry.y),
rotation=start_end_pts.iloc[-1]['label_angle'], **anno_kwargs)
map_file = os.path.join(out_folder, "Trial-{}_map.png".format(line_id))
plt.savefig(map_file, dpi=300)
plt.close()
if config.get_debug_mode():
LOGGER.info('{:<30}\t{:>10} {dur:<15} {}'.format(
'Map Saved', '', map_file, dur=timedelta(seconds=time.time() - loop_time)))
# Create the graphs ----------------------------------------------
fig_graph = plt.figure(figsize=(15, 10)) # width, height
fig_graph.subplots_adjust(hspace=0, wspace=0.05)
gs = gridspec.GridSpec(3, 1, height_ratios=[3, 3, 1])
if config.get_debug_mode():
gs = gridspec.GridSpec(4, 1, height_ratios=[3, 3, 1, 3])
axs = []
for igs, ss in enumerate(gs):
if igs == 0:
axs.append(fig_graph.add_subplot(ss))
else:
axs.append(fig_graph.add_subplot(ss, sharex=axs[0]))
# add a title
axs[0].set_title('Strip trial analysis for Trial {} - {} strip'
.format(line_id, offset_subst, size, use_pvalue), fontsize=16)
df_statstable.plot(x='DistOnLine', y=column_names['Value'][0], marker='.', ax=axs[0],
label='Treatment') # ,colormap='rainbow', figsize=(25, 5))
df_statstable.plot(x='DistOnLine', y=control_mean, marker='.', ax=axs[0],
label='Control')
for name, group in df_statstable.groupby(['Strip Zone', 'zone_marker', 'sig_color',
'sig_label']):
group.plot.scatter(x='DistOnLine', y='av_treat_dif', marker=name[1], s=25,
c=name[2], ax=axs[1], zorder=2,
label='Zone {} {}'.format(name[0], name[3])) # ,s=25
group.plot.scatter(x='DistOnLine', y='RI', marker=name[1], s=25, c='k',
ax=axs[2], zorder=2,
label='Zone {}'.format(name[0])) # s=25
if config.get_debug_mode():
group.plot.scatter(x='DistOnLine', y='p_value', marker=name[1], s=25, c='k',
ax=axs[3], label='Zone {}'.format(name[0]), zorder=2)
axs[0].set(ylabel="Treatment Units")
axs[1].set(ylabel="Treatment Difference")
axs[2].set(ylabel="Response\nIndex")
if config.get_debug_mode():
axs[3].set(ylabel="p value")
for i, ea in enumerate([0.05, 0.01, 0.005, 0.001]):
axs[3].axhline(y=ea, color=colour_list[i], linestyle='-', alpha=0.8, zorder=1,
label='p={}'.format(ea))
for iax, ea_ax in enumerate(axs):
ea_ax.grid(True, which='major', axis='x')
ea_ax.set(xlabel="Distance (m) from start of strip (see map)")
# increase graph outline
plt.setp(ea_ax.spines.values(), linewidth=1)
# Shrink current axis by 20% https://stackoverflow.com/a/4701285
box = ea_ax.get_position()
ea_ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ea_ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# remove duplicates labels from legend https://stackoverflow.com/a/13589144
handles, labels = ea_ax.get_legend_handles_labels()
by_label = OrderedDict(zip(labels, handles))
if config.get_debug_mode() and iax == 1:
ea_ax.legend(by_label.values(), by_label.keys(), loc='center left',
bbox_to_anchor=(1, 0.5), edgecolor='w')
# title='Using a p-value of {} '.format(use_pvalue))
else:
ea_ax.legend(by_label.values(), by_label.keys(), loc='center left',
bbox_to_anchor=(1, 0.5), edgecolor='w')
plt.savefig(file_path_noext + '_graph.png', index=False)
plt.close()
if config.get_debug_mode():
LOGGER.info('{:<30}\t{:>10} {dur:<15} {}'.format(
'Saved graph', '', file_path_noext + '_graph.png',
dur=timedelta(seconds=time.time() - loop_time)))
LOGGER.info('{:<30} {:>15} {dur}'.format(
'T-Test for Line {} Completed'.format(line_id), status,
dur=timedelta(seconds=time.time() - loop_time)))
return df_table
[docs]def persistor_all_years(raster_files, output_tif, greater_than, target_percentage):
"""Determine the performance persistence of yield by across multiple years as described in
Bramley and Hamilton (2005)
The "Target over all years" method assigns a value to each pixel to indicate the number of
instances (in the raster list) in which that pixel was either less than or greater than
the mean (+/- a nominated percentage) of that raster.
All input rasters MUST overlap and have the same coordinate system and pixel size.
If a path is omitted from output_tif it will be created in your temp folder.
References:
Bramley RGV, Hamilton RP (2005) Understanding variability in winegrape production systems
1. Within vineyard variation in yield over several vintages. Australian Journal Of Grape And
Wine Research 10, 32-45. doi:10.1111/j.1755-0238.2004.tb00006.x.
Args:
raster_files (List[str]): List of rasters to use as inputs
output_tif (str): Output TIF file
greater_than (bool): if true test above (gt) the mean
target_percentage (int): the percent variation either above/below the mean.
This should be a integer between -50 and 50
Returns:
str: The output tif name
"""
if not isinstance(target_percentage, int):
raise TypeError('target_percentage must an integer between -50 to 50.')
if target_percentage < -50 or target_percentage > 50:
raise ValueError('target_percentage must an integer between -50 to 50.')
if not isinstance(greater_than, (int, bool)):
raise TypeError('greater_than must be boolean.')
if output_tif is not None and not os.path.isabs(output_tif):
output_tif = os.path.join(TEMPDIR, output_tif)
if not isinstance(raster_files, list):
raise TypeError('Invalid Type: raster_files should be a list')
if len(raster_files) == 0:
raise TypeError('Invalid Type: Empty list of raster files')
start_time = time.time()
# combine and align grid and slip to common area
image_combined, _ = stack_and_clip_rasters(raster_files, use_common=True,
output_tif='allyrs_combined.tif')
with rasterio.open(image_combined, 'r') as src:
kwargs = src.meta.copy()
for i, band in enumerate(src.read(masked=True), start=1):
cutoff = np.nanmean(band) + (np.nanmean(band) * (target_percentage / 100.00))
band_new = band.copy()
# classify bands
if greater_than:
band_new = np.where(band > cutoff, 1, 0)
else:
band_new = np.where(band < cutoff, 1, 0)
# reapply nodata values
band_new[np.where(band.mask)] = -9999
band_new = band_new.astype(np.int16)
# cumulative sum of bands
if i == 1:
sum_of_rasters = band_new
else:
sum_of_rasters = sum_of_rasters + band_new
sum_of_rasters[band.mask] = -9999
del band, band_new
kwargs.pop('count')
kwargs.update({'nodata': -9999, 'dtype': np.int16})
with rasterio.open(output_tif, 'w', count=1, **kwargs) as dst_allyrs:
dst_allyrs.write(sum_of_rasters.astype(np.int16), 1)
arg_str = '{} {}%'.format('>' if greater_than else '<', target_percentage)
LOGGER.info('{:<30} {:>15} {dur}'.format('Persistor All Years Completed', arg_str,
dur=timedelta(seconds=time.time() - start_time)))
# cleanup Temp files
if not config.get_debug_mode() and TEMPDIR in image_combined:
os.remove(image_combined)
return output_tif
[docs]def persistor_target_probability(upper_raster_files, upper_percentage, upper_probability,
lower_raster_files, lower_percentage, lower_probability,
output_tif):
"""Determine the probability of a performance being exceeded or not being met with an upper and
lower limit as described in Bramley and Hamilton (2005).
The "Target probability" method builds on the target over all years method, in that it includes
an upper range (i.e. cells with a given frequency of values that are above the mean +/- a
given percentage) and a lower range (i.e. cells with a given frequency of values that are
below the mean +/- a given percentage).
A value is assigned to each pixel which indicates whether the performance in that pixel over
a given proportion of years is:
a) Greater than the mean plus or minus the nominated percentage (value = 1)
b) Less than the mean plus or minus the nominated percentage (value = -1)
The remaining pixels which do not fall into category a) or b) are given a value of 0.
All input rasters MUST overlap and have the same coordinate system and pixel size.
If a path is omitted from output_tif it will be created in your temp folder.
References:
Bramley RGV, Hamilton RP (2005) Understanding variability in winegrape production systems
1. Within vineyard variation in yield over several vintages. Australian Journal Of Grape
And Wine Research 10, 32-45. doi:10.1111/j.1755-0238.2004.tb00006.x.
Args:
upper_raster_files (List[str]): List of rasters to used for the analysis of the
UPPER category
upper_percentage (int): the percent variation either above/below the mean to apply
to the UPPER raster category.
upper_probability (int):the probability percentage to apply to the LOWER category
lower_raster_files (List[str]): List of rasters to used for the analysis of the
LOWER category
lower_percentage (int): the percent variation either above/below the mean to apply
to the LOWER raster category.
lower_probability (int): the probability percentage to apply to the LOWER category
output_tif (str): Output TIF file
"""
for arg_check in [('upper_raster_files', upper_raster_files),
('lower_raster_files', lower_raster_files)]:
if not isinstance(arg_check[-1], list):
raise TypeError('Invalid Type: {} should be a list'.format(arg_check[0]))
if len(arg_check[-1]) == 0:
raise TypeError('Invalid Type: {} is a empty list'.format(arg_check[0]))
for arg_check in [('upper_percentage', upper_percentage),
('upper_probability', upper_probability),
('lower_percentage', lower_percentage),
('lower_probability', lower_probability)]:
if not isinstance(arg_check[1], int):
raise TypeError('{} must an integer'.format(arg_check[0]))
for arg_check in [('upper_percentage', upper_percentage),
('lower_percentage', lower_percentage)]:
if not isinstance(arg_check[1], int):
raise ValueError('{} must an integer between -50 to 50.'.format(arg_check[0]))
if output_tif is None or output_tif == '':
raise TypeError('Please specify an output filename')
if not os.path.exists(os.path.dirname(output_tif)):
raise IOError('Output directory {} does not exist'.format(os.path.dirname(output_tif)))
start_time = time.time()
out_prefix, out_ext = os.path.splitext(os.path.basename(output_tif))
# Process upper ------------------------------------------------------------------------------
temp_file_list = [os.path.join(TEMPDIR, out_prefix + '_1_upperyears' + out_ext)]
upper_tif = persistor_all_years(upper_raster_files,
temp_file_list[-1],
greater_than=True,
target_percentage=upper_percentage)
upper_cutoff = (upper_probability / 100.00) * len(upper_raster_files)
with rasterio.open(upper_tif, 'r') as src:
kwargs = src.meta.copy()
band = src.read(masked=True)
# Greater than test returns boolean which can be converted to 0,1
data_u = np.where(~band.mask, (band >= upper_cutoff).astype(np.int16), -9999)
if config.get_debug_mode():
temp_file_list += [os.path.join(TEMPDIR, out_prefix + '_2gt_upperprob' + out_ext)]
with rasterio.open(temp_file_list[-1], 'w', **kwargs) as dst:
dst.write(data_u)
del band
# Process lower ------------------------------------------------------------------------------
temp_file_list += [os.path.join(TEMPDIR, out_prefix + '_1_loweryears' + out_ext)]
lower_tif = persistor_all_years(lower_raster_files,
temp_file_list[-1],
greater_than=False,
target_percentage=lower_percentage)
lower_cutoff = (lower_probability / 100.00) * len(lower_raster_files)
with rasterio.open(lower_tif, 'r') as src:
band = src.read(masked=True)
# (band >= lower_cutoff) returns booleans (0,1) we need (0,-1) so
# convert to integer and use np.negative to switch it while maintaining the nodata values.
data_l = np.where(~band.mask, np.negative((band >= lower_cutoff).astype(np.int16)), -9999)
if config.get_debug_mode():
temp_file_list += [os.path.join(TEMPDIR, out_prefix + '_2gtn_lowerprob' + out_ext)]
with rasterio.open(temp_file_list[-1], 'w', **kwargs) as dst:
dst.write(data_l)
with rasterio.open(output_tif, 'w', **kwargs) as dst:
result = np.where(~band.mask, data_l + data_u, -9999)
dst.write(result.astype(np.int16))
LOGGER.info('{:<30} {:>15} {dur}'.format('Persistor Target Probability Completed', '',
dur=timedelta(seconds=time.time() - start_time)))
# clean up of intermediate files
if len(temp_file_list) > 0 and not config.get_debug_mode():
for ea in temp_file_list:
for f in glob.glob(os.path.splitext(ea)[0] + '*'):
os.remove(f)
return