This tutorial provides guidance on creating a machine learning model to identify crop types from satellite imagery and other earth observation data. For users with a sample of known crop locations, the tutorial demonstrates and describes the steps required to train a model on the known samples and predict crop locations for a wider area of interest.
Overview
Introduction to Data
Developing Crop Type Mapping Model
Generate Predicted Cultivation Maps
Smallholder farms are increasingly in the spotlight of development goals at a national and international level. As changing climates and growing populations increase food scarcity, creating sustainable food systems has become a major focus (Economist Impact Food Sustainability Index 2021). However, monitoring the progress of these farms has been a challenge, as researchers and policy makers have relied on surveys or manual labeling of imagery to identify the type and scope of farming operations (Azzari et al. 2021). There is an acute need for new approaches to monitoring smallholder agriculture systems, particularly in developing countries; these are the primary focus of development goals, yet they often lack the resources to facilitate consistent, widespread monitoring.
The 50x2030 Initiative to Close the Agricultural Data Gap aims to empower and support 50 low- and lower-middle-income countries (L/LMICs) by 2030 to build strong national data systems that produce and use high-quality and timely agricultural data through survey programs. The activities supported by the 50x2030 Initiative are carried out under three Implementation Components: (1) Data Production, led by the Food and Agriculture Organization of the United Nations (FAO) in collaboration with the World Bank Development Data Group, the Poverty and Equity Global Practice, and the Agriculture Global Practice; (2) Methods and Tools Development, led by the World Bank Development Data Group; and (3) Data Use, led by the International Fund for Agricultural Development.
Under the Methods and Tools Development Component (MTD Component), the 50x2030 Initiative supports research to produce new, better, and more cost-effective tools and methodologies for data collection and analysis in the context of agricultural and rural surveys. The MTD Component has three major work areas: (1) integration of surveys, (2) integration of improved methods into survey data collection, and (3) integration of surveys with other data sources. Under this third work area, 50x2030-supported research activities are geared towards the development of methods for the integration of survey data with different data sources, including but not limited to geospatial, administrative, and census data, with an eye on enhancing the value of survey data in policy-relevant analysis and research. A strong emphasis is placed on enabling surveys to feed into remote sensing applications that aim to produce actionable, high-resolution estimates of key indicators at scale, anchored in the demands voiced by national governments and international development partners for remote sensing applications that can both inform agricultural decision-making and help monitor and understand progress towards Sustainable Development Goal (SDG) 2, with a focus on SDG Targets 2.3 and 2.4.
Since georeferenced ground data creates the backbone required for accurate satellite-based outcome monitoring, the 50x2030 initiative is uniquely positioned to catalyze the implementation, and eventual use, of integrated satellite-survey applications that can rapidly generate maps of agricultural outcomes across expansive geographies in smallholder systems, reaching far beyond the coverage of field surveys. Over the last decade, research has focused on developing and testing algorithms to derive satellite-based yields in large-scale systems (Clevers 1997; Lobell et al. 2005) and in smallholder systems (Burke and Lobell, 2017; Gourlay et al., 2019; Jain et al., 2016; Jin et al., 2017; Lambert et al., 2018; Lobell et al., 2019, 2020).
However, progress has been slower than desired in implementing integrated satellite-survey applications that can monitor agricultural outcomes at scale (i.e., for entire countries and across continents) in the smallholder production systems that characterize much of the agriculture sector in low- and lower-middle-income countries. One of the main hurdles has been the lack of knowledge regarding the required volume, methods, and content of georeferenced microdata that should be collected as part of surveys in order to inform remote sensing applications capable of fulfilling spatially-disaggregated estimation and reporting needs.
Against this background, the MTD Component of the 50x2030 Initiative is supporting research to generate guidelines for designing and implementing large-scale surveys in a way that can generate the required data for training remote sensing models for high-resolution crop area and crop yield mapping in low- and lower-middle income countries (50x2030 n.d.).
In support of 50x2030, developers at Atlas AI, a public benefit corporation applying machine learning techniques to measure economic and agricultural development, partnered with the World Bank's Development Data Group to scale up the integration of satellite and survey data for agricultural use cases. In particular, they are working to increase the knowledge base around using these techniques for agricultural monitoring, as well as to provide public datasets and maps identifying crop types. Their first paper, Understanding the Requirements for Surveys to Support Satellite-Based Crop Type Mapping: Evidence from Sub-Saharan Africa (Azzari et al. 2021), provides recommendations for household survey collections to improve machine learning models which identify crop types from satellite imagery. Specifically, they provide guidance for surveys which collect geocoded agricultural information, including the crop types under cultivation for each household. Using maize plots in Malawi and Ethiopia as the basis of their analysis, they test a single, optimized model with varying data inputs to identify which collection of inputs provides the best results. They use household-level survey data and corresponding geospatial plot information as inputs to the models, and they find that with the optimal combination of inputs they are able to identify pixels with maize cultivation with up to 75% accuracy.
The research team determined that the best crop type classification performance is achieved by collecting a complete plot boundary and then using features aggregated over the entire plot (i.e. plot mean) as inputs to the machine learning pipeline. Performance for their models was highest when using at least 3,000 - 4,000 sample plots. They found the best results when they did not remove any plots from their training data based on a size threshold, and finally they determined that optical satellite imagery can optimize prediction quality without the need for additional satellite products such as Synthetic Aperture Radar (SAR). In this learning module, we will reproduce the steps necessary to create this optimized machine learning model, allowing a user to replicate the process themselves for any location or product of interest given the appropriate survey responses.
In geospatial data analysis, data can be classified into two categories: raster and vector data. A graphic comparison between raster and vector data can be found in the World Bank Nighttime Lights Tutorial module 2, section 1.
In this tutorial, we will use both vector and raster data. Geospatial data in vector format are often stored in a shapefile, a popular format for storing vector data developed by ESRI. The shapefile format is actually composed of multiple individual files which together make up the complete dataset. At a minimum, there will be 3 file types included with this geographic data (.shp, .shx, .dbf), but there are often other files included which store additional information. In order to be read and used as a whole, all file types must have the same name and be in the same folder. Because the structures of points, lines, and polygons are different, each shapefile can only contain one vector type (all points, all lines, or all polygons). You will not find a mixture of point, line, and polygon objects in a single shapefile, so in order to work with these different types in the same analysis, multiple shapefiles will need to be used and layered. For more details on shapefiles and file types, see this documentation.
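As a quick, hypothetical illustration of how these component files behave in practice, the sketch below reads a shapefile with GeoPandas (the path is made up): pointing at the .shp is enough, since the sibling .shx and .dbf files are picked up automatically.

import geopandas as gpd
# Hypothetical path: reading 'plots.shp' also pulls in plots.shx, plots.dbf, etc.,
# as long as they share the same name and sit in the same folder
gdf = gpd.read_file('data/plots.shp')
# a shapefile holds a single geometry type, so this prints one value, e.g. ['Polygon']
print(gdf.geom_type.unique())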
Raster data, on the other hand, is commonly stored in the Tagged Image File Format (TIFF or TIF). A GeoTIFF is a TIFF file that follows a specific standard for structuring metadata. The metadata stored in a TIFF is called a tif tag, and GeoTIFFs often contain tags including spatial extent, coordinate reference system, resolution, and number of layers.
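To see the kind of metadata those tags carry, here is a small sketch using the rasterio package (not otherwise used in this tutorial; the file path is hypothetical):

import rasterio
# open a hypothetical GeoTIFF and inspect the georeferencing stored in its tags
with rasterio.open('elevation.tif') as src:
    print(src.crs)     # coordinate reference system
    print(src.bounds)  # spatial extent
    print(src.res)     # pixel resolution
    print(src.count)   # number of layers (bands)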
More information and examples can be found in sections 2 & 3 of the Earth Analytics Python Course.
We'll be sourcing the EO data used in this process from Google Earth Engine. For necessary Python setup and an introduction to our use of the GEE Python API, see the World Bank Nighttime Light Tutorial, module 2 sections 2-5. In particular, before proceeding you will need to have jupyter and geemap installed on your machine, and you will need to apply for a Google Earth Engine account here. It may take a day or longer for your Google Earth Engine account to be granted access.
Two of the primary packages we'll be using, Pandas and GeoPandas, must be installed according to their installation instructions: Pandas Installation and GeoPandas Installation. If you're on Windows, GeoPandas installation can occasionally be temperamental - using an environment, as in the World Bank Nighttime Lights Tutorial, can often circumvent any issues, but if you're still having problems, there are a number of guides online, such as this Practical Data Science guide or this Medium post by Nayane Maia, which provide installation help that may allow you to be more successful. Using Windows Subsystem for Linux (WSL) can also make working with tricky packages like GeoPandas easier.
While the survey data used in identifying crop types will differ by the location and time period of interest for your use case, for the purposes of this module we will leverage the public use versions of the survey data used by Azzari et al. in their paper: the Integrated Household Panel Survey (IHPS) 2019 and the Fifth Integrated Household Survey (IHS5) 2019/2020, both of which were implemented by the Malawi National Statistical Office. While these were designed to measure consumption and poverty, they also provide valuable information on agricultural activities. Both utilized identical agricultural questionnaires which not only identified crop type and distribution within and across plots, but also captured the GPS coordinates for the plot perimeters and corners wherever possible. These provide household-level responses from Malawi between 2018 and 2020.
The survey data and associated documentation for both the Integrated Household Panel Survey 2019 and the Fifth Integrated Household Survey 2019/2020 can be found in the World Bank's Microdata Library. Both of these sources are Public Use Files, meaning they are available for users who agree to only utilize their contents for statistical and scientific research at an aggregated level and as part of a larger product. To use this data, you'll need to register and agree to the full terms of use, which you can find here. Note that the GPS coordinates are not available as part of the World Bank's Microdata Library due to privacy concerns. For any survey data you'll use, you'll need to ensure you have secure access to GPS locations for the plots. For this demonstration, since the GPS locations are not available, we'll be working with sample plots.
The method refined by Atlas AI and the World Bank to identify crop type from satellite imagery utilizes optical imagery sourced from Sentinel-2 (S2) satellites. The S2 satellites are part of the Copernicus initiative, which was developed by the European Space Agency and the European Commission to provide atmospheric, oceanic, and land monitoring of Earth for a multitude of applications (ESA Sentinel Overview). The Sentinel-2 mission consists of two identical satellites: Sentinel-2A, launched June 23, 2015, and Sentinel-2B, launched March 7, 2017. The MultiSpectral Instrument (MSI) onboard each satellite passively collects sunlight reflected from earth within 13 spectral bands ranging from Visible and Near-Infrared (VNIR) to Shortwave Infrared (SWIR) wavelengths. These are used to enable vegetation, land cover, and environmental monitoring of all of Earth's land surfaces and coastal waters every five days (USGS EROS Sentinel-2 2018).
We will be using S2 Level-2A imagery to make predictions of maize production. The Level-2A product provides $100 \times 100\ km^2$ ortho-images, which means the images have been orthorectified to remove distortion caused by differing terrain and sensor angles when the images were collected. Orthorectification allows the images to be used to accurately measure and quantify land features and distances. For more information on orthorectification, see this USGS brief explanation or this more detailed description from the Satellite Imaging Corporation. The Sentinel-2 Level-2A Optical Imagery can be obtained from Google Earth Engine with the identifier "COPERNICUS/S2_SR".
The Level-2A product provides Bottom Of Atmosphere (BOA) reflectance, or Surface Reflectance (SR), images, which are derived from Sentinel-2 Level-1C Top Of Atmosphere (TOA) reflectances. TOA reflectance values represent all light reflected back to the satellite from the earth or the atmosphere. For analyses like ours, which focus on land cover, the presence of atmospheric reflections can distort the interpretation of the image, as they are blended with the reflectances from earth. To remove this potential source of distortion, atmospheric corrections can be applied to reduce or remove the effect of reflectances from the atmosphere and retain only the light reflected from the ground. The Level-2A product is the result of processing the Level-1C product to remove atmospheric reflectance, and thus is the preferred source for our investigation of crop type. However, the Level-2A product is only available after 2018, so if you need Bottom Of Atmosphere imagery from an earlier time period, you'll need to perform this processing of the Level-1C imagery yourself. The Sentinel-2 Toolbox provides a mechanism to perform this correction, or for a Python solution, this Github repository contains code for this task.
Sentinel-2 imagery records values for multiple spectral bands, but in practice, rather than being used individually, these bands are often combined into indices. Spectral indices can combine multiple bands with complementary information into a single value which can provide more information on ground cover than the individual bands. Our analysis will use the Red Edge 4 (RDED4) band along with the Green Chlorophyll Vegetation Index (GCVI), Normalized Burn Ratio 1 (NBR1), Normalized Difference Temperature Index (NDTI), Normalized Difference Vegetation Index (NDVI), and Smoothed Normalized Difference Vegetation Index (SNDVI), all of which are calculated from S2 bands. Azzari et al. provide the equations for these indices in section 2.2.2 of Understanding the Requirements for Surveys to Support Satellite-Based Crop Type Mapping: Evidence from Sub-Saharan Africa, and the code they provide will perform these calculations as well.
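For a flavor of what these index calculations look like, below is a minimal sketch computing NDVI and GCVI for a single Earth Engine image, using the common definitions NDVI = (NIR - RED) / (NIR + RED) and GCVI = NIR / GREEN - 1 and the standard Sentinel-2 catalog band names ('B8' for NIR, 'B4' for red, 'B3' for green). The exact formulations used by Azzari et al. are the ones given in their paper and implemented in their code.

import ee

def add_indices(img):
    # NDVI from the NIR and red bands
    ndvi = img.normalizedDifference(['B8', 'B4']).rename('NDVI')
    # GCVI = NIR / green - 1
    gcvi = img.expression('NIR / GREEN - 1', {
        'NIR': img.select('B8'),
        'GREEN': img.select('B3'),
    }).rename('GCVI')
    return img.addBands(ndvi).addBands(gcvi)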
While the core components of this model are the satellite imagery and the survey data, Azzari et al. also found it helpful to incorporate additional Earth Observation (EO) data sources into their predictive model. They selected features of the landscape and climate which are correlated with the selection of crop types for given plots of land.
In order to assess cropland suitability, the team utilized elevation, slope, and aspect (the direction the slope faces) as proxies. These selections were made under the assumption that erosion and soil degradation are exacerbated by high slope and elevation, making these areas less likely to be strong agricultural sites. Elevation, slope, and aspect can be obtained from the Shuttle Radar Topography Mission (SRTM) data source, which is available at a resolution of 30m. The Shuttle Radar Topography Mission, flown on the space shuttle Endeavour in February 2000, was used by the National Aeronautics and Space Administration (NASA) and the National Geospatial-Intelligence Agency (NGA) to create near-global land elevation data. This was accomplished through the use of two radar antennas on the Endeavour, with the surface elevation derived from the difference between the signals received by each antenna (a technique known as radar interferometry).
Multiple versions of this SRTM elevation data have been created from the original data collection, with subsequent versions performing tasks such as identifying and correcting for water bodies and filling voids. These versions are made available in 1 and 3 arc-second resolutions, where 1 arc-second corresponds to roughly 30m and 3 arc-seconds corresponds to approximately 90m. For this model, we'll leverage Version 3.0 (also known as SRTM Plus), which was developed by NASA's Jet Propulsion Laboratory (JPL) to fill the voids in the elevation data with the non-commercial data sources ASTER GDEM2 (a joint project of NASA and components of the Japanese government), GMTED2010 (developed by NGA and the U.S. Geological Survey (USGS)), and NED (developed by USGS). For more information on the SRTM versions and methodology, see the SRTM User Guide.
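As a sketch of how these terrain features can be pulled from GEE (assuming the 'USGS/SRTMGL1_003' asset, which is the 1 arc-second SRTM Plus product in the Earth Engine catalog), slope and aspect can be derived from the elevation band with ee.Terrain:

import ee
# SRTM Plus (v3), 1 arc-second (~30m) digital elevation model
srtm = ee.Image('USGS/SRTMGL1_003')
elevation = srtm.select('elevation')
slope = ee.Terrain.slope(elevation)    # slope in degrees
aspect = ee.Terrain.aspect(elevation)  # aspect in degrees clockwise from north
terrain = elevation.addBands(slope).addBands(aspect)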
Weather considerations are also an important component of cropland suitability, so these can also be incorporated into this crop classification model. Azzari et al. included average temperature, total precipitation, and growing degree days (GDD), with a growing degree day defined as a day on which the mean temperature exceeds a base value that must be surpassed for a crop to grow (Azzari et al. 2021). For maize, which is the crop we're using in this example, the base value defining a GDD is $10^{\circ}$C.
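Under that definition, counting growing degree days from a table of daily mean temperatures is straightforward; the sketch below uses a made-up pandas dataframe with a hypothetical 'tmean' column in degrees Celsius:

import pandas as pd
# hypothetical daily mean temperatures (deg C) for part of a growing season
daily = pd.DataFrame({'tmean': [8.5, 12.0, 15.2, 9.9, 18.1, 10.0]})
base_temp = 10.0  # base value for maize
# a day counts as a growing degree day when its mean temperature exceeds the base value
gdd_count = (daily['tmean'] > base_temp).sum()
print(gdd_count)  # 3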
Azzari et al. obtained weather data from the aWhere Daily Observed Weather API, which provided temperatures and precipitation by location and day and could be used to create average temperature, GDD, and total precipitation metrics. However, aWhere is no longer available, so users can instead obtain precipitation and temperature metrics for their areas of interest from other available sources. Some alternatives include:
If weather data is not available for your area of interest, however, the model can be created without incorporating any weather data. We expect a minimal effect on model accuracy from excluding weather data.
To classify satellite imagery, Azzari et al. utilize a random forest classification model. A random forest model consists of an ensemble of decision trees (i.e., a set of many individual decision trees), where each decision tree utilizes a series of features to determine the optimal classification for given observations. For an approachable overview of random forest models, see this Towards Data Science post by Tony Yiu, or for a more thorough overview see Chapter 8 of An Introduction to Statistical Learning by Gareth James, Daniela Witten, Trevor Hastie, and Rob Tibshirani, which can be downloaded for free here. Further information on random forest models can also be found in The Elements of Statistical Learning: Data Mining, Inference, and Prediction, by Trevor Hastie, Rob Tibshirani, and Jerome Friedman, which can be downloaded free here, and an introduction to random forests including Python example code can be found in Hands-on Machine Learning with Scikit-Learn, Keras & TensorFlow by Aurelien Geron.
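To make the model concrete before we assemble real features, here is a minimal, self-contained scikit-learn sketch on synthetic data; the real pipeline substitutes plot-level satellite and EO features for X and the maize indicator for y.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# synthetic stand-ins for plot-level features (X) and a binary crop label (y)
X, y = make_classification(n_samples=500, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# an ensemble of 100 decision trees, each trained on a bootstrap sample of the data
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)
print('held-out accuracy:', rf.score(X_test, y_test))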
We will demonstrate the process for developing a random forest model similar to the one created in Azzari et al. 2021, though due to limitations in data availability, we are not able to recreate the model in its totality. We hope to provide a guide for researchers who possess similar data to identify crop types in their own areas of interest.
To begin, in order for a random forest model to successfully identify crop types, we need to provide it information (features) about each plot we hope to identify. We will need to provide these features for both the data set we use for training and for any additional plots we hope to classify using our trained model. The data sources we mentioned above - Sentinel-2 Imagery, Survey Data, and the additional Earth Observation data - will provide the features for our model.
Note: Most of the code below was written by Natalie Ayers for the purpose of developing a learning module for the Geo4Dev Initiative. In the few cases where code was used from the Atlas AI GitHub repository, it is explicitly identified.
As mentioned above, the survey data we'll be using to model this process is from Malawi and can be obtained (without the geocoded locations) through the World Bank's Microdata Library. Note that we're using Malawi's survey data only as a demonstration of the kind of survey information needed; we are not sharing any actual Malawi survey data below. For different locations or time periods, you'll need to obtain the appropriate associated survey results to use in the model. This example will thus move relatively quickly through the specifics of these Malawi surveys and instead focus on the fields which should be present in your final output. While the format and contents of each survey will vary, to be used in this model a survey must contain information including:
Azzari et al. also removed all plots with duplicated location information, as well as plots lacking a high degree of confidence in the quality of the location data; doing the same is recommended for the best model performance.
We'll need to be able to identify each distinct plot as we collect the SRTM and Sentinel-2 data - we'll use the plot geographies to collect the associated SRTM and S2 information for each plot, but we should also create a unique identifier for each plot geography that we can use to link the plots together. While this will differ depending on the survey data you use, in our example, if we will not be receiving any additional data, we could randomly assign unique identifiers to every record we have. We could also distinctly identify each plot by the 'hh_id', 'gardenid', 'plotid' combination (identified as unique by the survey documentation) by hashing these values to create a distinct numeric identifier, as shown in the sketch below. Hashing deterministically maps a given piece of text to a number, so identical combinations always receive the same identifier. Pandas can handle a join by string fields relatively well, but performance will be better when joining on numeric fields.
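For example, a sketch of this hashing step with pandas (the identifier values below are made up):

import pandas as pd
# made-up identifier fields; real values come from the survey
plots = pd.DataFrame({'hh_id': ['0101', '0101', '0102'],
                      'gardenid': ['1', '1', '1'],
                      'plotid': ['1', '2', '1']})
# concatenate the three fields and hash each combination to a 64-bit integer
combined = plots['hh_id'] + '_' + plots['gardenid'] + '_' + plots['plotid']
plots['unique_id'] = pd.util.hash_pandas_object(combined, index=False)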
We're unable to share survey data or geographic identifiers, so we show sample values below to demonstrate what should be available in any survey data used for this model. These are based on the Malawi Integrated Household Panel Survey (IHPS) 2019 and the Fifth Integrated Household Survey (IHS5) 2019/20, which were used by Azzari et al. In this case, a unique identifier was created for each plot, while the dummy crop_codes identify which crops are planted in a given plot (i.e., if only crop_code_a is provided, the plot is monocropped, while if multiple crop_codes are provided, the plot is intercropped). The plot_perimeter should be a geographic polygon of the plot boundaries.
unique_id | crop_code_a | crop_code_b | crop_code_c | crop_code_d | crop_code_e | plot_area | plot_perimeter
---|---|---|---|---|---|---|---
-6645496217314902902 | 1 | 34 | 12 | | | 1.51 | geom_poly
-8821404992437737508 | 12 | 38 | | | 28 | 1.64 | geom_poly
2659752206514122228 | 1 | 28 | 38 | 42 | | 13.3 | geom_poly
To work with our data tables, we will be using Pandas, a popular and versatile tool for data analysis in Python, and GeoPandas, a popular Python package for working with geographic data. If your data comes in as plot corner points or center points, rather than a geographic type, you can use GeoPandas to convert it to one.
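For instance, if your survey recorded the four corner points of each plot, a sketch like the following (with made-up coordinates) would assemble them into a polygon:

import geopandas as gpd
from shapely.geometry import Polygon

# made-up corner coordinates (lon, lat) for one plot
corners = [(34.5575, -14.3054), (34.5566, -14.3055),
           (34.5568, -14.3063), (34.5577, -14.3062)]
# build a polygon from the corners and wrap it in a GeoDataFrame with a CRS
plot_gdf = gpd.GeoDataFrame({'unique_id': [1]},
                            geometry=[Polygon(corners)],
                            crs='EPSG:4326')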
To start, we will import our sample survey data which contains sample plot polygons. These plot polygons will use the Coordinate Reference System (CRS) World Geodetic System 1984 (WGS84), which has the code EPSG:4326 and is one of the most common Coordinate Reference Systems. This system defines locations on a three-dimensional surface using degrees, which is what we know as latitude and longitude and is used by GPS systems. For more information on Coordinate Reference Systems in Python, see this Towards Data Science tutorial by Abdishakur.
Note: These are NOT real fields and were just created for the purpose of demonstration by selecting points on Google Maps. 'plot_c' was also created to be unusually large to ease visualization and analysis for the purpose of this tutorial. To view the code used to create this sample dataset, see Satellite Crop Mapping Appendix.ipynb in the GitHub Repo for this module.
import pandas as pd
import geopandas as gpd
survey_gdf = gpd.read_file('sample_survey_fields_geo.geojson')
survey_gdf.head()
crop_code_a | crop_code_b | crop_code_c | crop_code_d | crop_code_e | plot | unique_id | geometry | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 34 | 12.0 | NaN | NaN | plot_a | -6645496217314902902 | POLYGON ((34.55748 -14.30543, 34.55665 -14.305... |
1 | 12 | 38 | NaN | NaN | 28.0 | plot_b | -8821404992437737508 | POLYGON ((33.93234 -11.56669, 33.93147 -11.567... |
2 | 1 | 28 | 38.0 | 42.0 | NaN | plot_c | 2659752206514122228 | POLYGON ((33.34090 -13.60151, 33.33913 -13.602... |
With these polygons defined as plot_a, b, and c, we can visualize the plots using Google Earth Engine's mapping capabilities. If you're using Google Colab or other setups where you're running into errors with standard geemap (for example, Jupyter notebooks in VS Code using Windows Subsystem for Linux), you'll need to use geemap.foliumap.Map(). Otherwise, you can use geemap.Map().
Our first step will be to Authenticate the connection to Google Earth Engine. The first time you run this code, you'll need to Authenticate using the authentication code you're provided when you are granted GEE access. For more detailed instruction, see this Introduction to Google Earth Engine in the World Bank's Nighttime Lights tutorial.
Finally, we'll convert our GeoPandas polygons to geographic types that geemap will recognize, which we can do easily using geemap.geopandas_to_ee(). (For more information on converting between geemap and geopandas, see this documentation.) Once we've converted our plots, we can add them to a basemap from geemap to visualize the areas we'll be analyzing.
Note: As of spring 2022, there are problems using the Earth Engine package with Python 3.9. If you run into an error, for example ModuleNotFoundError: No module named 'StringIO', it may mean that you need to use a different version of Python. I've found 3.7 to work with all the packages required for this module. If you are following the World Bank OpenNightLights tutorial to create a conda environment, this will require creating the environment with conda create --name onl python=3.7 rather than python=3.9.
import geemap.foliumap, ee
# Initialize the Earth Engine session; the first run triggers a one-time
# authentication flow before initialization succeeds
try:
    ee.Initialize()
except Exception as e:
    ee.Authenticate()
    ee.Initialize()
plot_a = survey_gdf[survey_gdf['plot'] == 'plot_a'].loc[:,('unique_id','geometry')]
plot_b = survey_gdf[survey_gdf['plot'] == 'plot_b'].loc[:,('unique_id','geometry')]
plot_c = survey_gdf[survey_gdf['plot'] == 'plot_c'].loc[:,('unique_id','geometry')]
ee_plot_a = geemap.geopandas_to_ee(plot_a).geometry()
center_lat = -14.305434
center_lon = 34.557480
# this is not a real plot from any survey data; it was randomly selected for demonstration
map_plot_a = geemap.foliumap.Map(center=[center_lat,center_lon],zoom=15)
map_plot_a.add_basemap('SATELLITE')
map_plot_a.addLayer(ee_plot_a,vis_params={'color':'a4c639'},opacity=0.8)
map_plot_a
ee_plot_b = geemap.geopandas_to_ee(plot_b).geometry()
center_lat = -11.566691
center_lon = 33.932340
# this is not a real plot from any survey data; it was randomly selected for demonstration
map_plot_b = geemap.foliumap.Map(center=[center_lat,center_lon],zoom=15)
map_plot_b.add_basemap('SATELLITE')
map_plot_b.addLayer(ee_plot_b,vis_params={'color':'a4c639'},opacity=0.8)
map_plot_b
ee_plot_c = geemap.geopandas_to_ee(plot_c).geometry()
center_lat = -13.600627
center_lon = 33.339620
# this is not a real plot from any survey data; it was randomly selected for demonstration
map_plot_c = geemap.foliumap.Map(center=[center_lat,center_lon],zoom=15)
map_plot_c.add_basemap('SATELLITE')
map_plot_c.addLayer(ee_plot_c,vis_params={'color':'a4c639'},opacity=0.8)
map_plot_c
In our final dataset, we'll use these polygons, as Azzari et al. found that using the full plot areas produced one of the best performing models. If polygons or corner points are unavailable, other geographic identifiers such as plot centroids (the single point in the center of a plot) can be included in place of the polygons. One final task we can perform is to calculate the area of each plot - this may be useful if the area was not already provided with your survey.
While we created our polygons with points in latitude and longitude, if we want to calculate area in meters we can't use our original CRS, as it's measured in degrees. Instead, we need to choose a CRS which can correctly identify meters around our given area of interest, which is Malawi. This will require using a projected coordinate system, which is defined on a two-dimensional surface and will allow us to use our traditional distance measures. To search for the appropriate CRS for a location, you can use https://epsg.io/ - we find that EPSG:20936 will work for Malawi. We can project to this CRS easily with geopandas using to_crs(). For more information about projections, see this ESRI explainer.
# Project to appropriate CRS
survey_gdf.to_crs('EPSG:20936',inplace=True)
survey_gdf.crs
<Projected CRS: EPSG:20936>
Name: Arc 1950 / UTM zone 36S
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: Malawi. Zambia and Zimbabwe - east of 30°E.
- bounds: (30.0, -22.42, 35.93, -8.19)
Coordinate Operation:
- name: UTM zone 36S
- method: Transverse Mercator
Datum: Arc 1950
- Ellipsoid: Clarke 1880 (Arc)
- Prime Meridian: Greenwich
We can use the .area attribute to obtain the area of each geometry in the units of our projection. Because we've projected our geometries to a CRS with units in meters (see the 'Axis Info' in the .crs output above), the area of our plots will initially be returned in square meters. In order to convert to acres, which is a more common measurement for agricultural plots, we can use a conversion factor provided by Esri of $m^2$ * 0.0002471054 = acres.
survey_gdf['plot_area'] = survey_gdf.area * 0.0002471054
survey_gdf
crop_code_a | crop_code_b | crop_code_c | crop_code_d | crop_code_e | plot | unique_id | geometry | plot_area | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 34 | 12.0 | NaN | NaN | plot_a | -6645496217314902902 | POLYGON ((667960.790 8418253.597, 667871.248 8... | 1.508917 |
1 | 12 | 38 | NaN | NaN | 28.0 | plot_b | -8821404992437737508 | POLYGON ((601630.450 8721518.054, 601535.535 8... | 1.641266 |
2 | 1 | 28 | 38.0 | 42.0 | NaN | plot_c | 2659752206514122228 | POLYGON ((536850.549 8496642.083, 536658.975 8... | 13.306660 |
Now that we have our areas calculated, we can convert our plots back to the CRS of EPSG:4326.
survey_gdf = survey_gdf.to_crs(4326)
survey_gdf.crs
<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich
As a final step, since we're interested in identifying whether a plot contains maize, we can create a field "maize_pos" indicating whether any of the crop codes are maize. Depending on your crop of interest and the structure of your survey data, the code you'll need for this may vary, but for this example we'll treat the crop codes 1, 2, 3, and 4 as identifying maize.
# define codes which identify maize
maize_codes = [1,2,3,4]
# flag whether any crop code column contains a maize code (1 if so, 0 otherwise)
crop_cols = ['crop_code_a','crop_code_b','crop_code_c','crop_code_d','crop_code_e']
survey_gdf['maize_pos'] = survey_gdf.loc[:, crop_cols].isin(maize_codes).any(axis=1).astype(int)
survey_gdf
crop_code_a | crop_code_b | crop_code_c | crop_code_d | crop_code_e | plot | unique_id | geometry | plot_area | maize_pos | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 34 | 12.0 | NaN | NaN | plot_a | -6645496217314902902 | POLYGON ((34.55748 -14.30543, 34.55665 -14.305... | 1.508917 | 1 |
1 | 12 | 38 | NaN | NaN | 28.0 | plot_b | -8821404992437737508 | POLYGON ((33.93234 -11.56669, 33.93147 -11.567... | 1.641266 | 0 |
2 | 1 | 28 | 38.0 | 42.0 | NaN | plot_c | 2659752206514122228 | POLYGON ((33.34090 -13.60151, 33.33913 -13.602... | 13.306660 | 1 |
This dataframe can then be joined by unique_id to the other sources of data we'll need for our model, as sketched below.
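For instance, once we've assembled plot-level features from the other sources (here a hypothetical eo_features_df keyed on the same identifier), the join is a one-liner:

# hypothetical plot-level EO feature table keyed on unique_id
model_df = survey_gdf.merge(eo_features_df, on='unique_id', how='inner')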
To obtain our Sentinel-2 Level-2A imagery for Malawi, we will use Google Earth Engine's API to obtain the Sentinel-2 MSI: MultiSpectral Instrument, Level-2A ImageCollection. The S2 imagery needs to be pre-processed to mask pixels with clouds, shadows, haze, and snow so these do not influence our predictions of crop type. Masking pixels in an image makes the pixels transparent, so they will not be used in any of the operations or analysis performed. Masks are Images themselves that act as a guide for which pixels should be kept and which should be made transparent, and they can be applied to other Images covering the same area. Once applied, any pixels with mask values below a threshold (e.g., mask values of 0) are made transparent on the image being masked. For more information and an example of masking, see Google Earth Engine's Masking tutorial. Fortunately, one of the bands in the Sentinel-2 ImageCollection we'll use from Google Earth Engine is a Cloud mask, which identifies the pixels which represent clouds (both Opaque and Cirrus clouds). This enables us to identify and mask (make transparent) the pixels which don't represent land.
The Atlas AI team of Azzari et al. have provided code which obtains the S2 imagery and performs the cloud masking, as well as code which we can use to compute the Vegetation Indices which they ultimately use in the model.
First, though, let's define the area for our entire country of interest. We'll use this area to mask and calculate indices for the whole of Malawi, and then we can obtain the values for each plot from this overall Malawi map. We'll obtain the area of Malawi from FAO's Global Administrative Unit Layers 2015 (FAO GAUL 2015), which provides country and administrative area boundaries for most of the world and is also available through GEE. Level 0 of the FAO GAUL layers is the country level, so we'll use the GEE FeatureCollection of Level 0 boundaries.
# Obtain level0 boundaries, filter for Malawi, convert to geometry
malawi_geom = ee.Feature(ee.FeatureCollection("FAO/GAUL/2015/level0").filter(ee.Filter.eq('ADM0_NAME', 'Malawi')).geometry())
# Visualize the area of Malawi we created
center_lat = -13.305434
center_lon = 34.557480
map_malawi = geemap.foliumap.Map(center=[center_lat,center_lon],zoom=6)
map_malawi.add_basemap('SATELLITE')
map_malawi.addLayer(malawi_geom,vis_params={'color':'a4c639'},opacity=0.8)
map_malawi
With this area to use as our geometry, we can now begin to use Azzari et al.'s code to pull in the Sentinel-2 data, mask the clouds, and calculate the indices.
To make their code available for access on your machine, one option is to use Git to clone the repository. Cloning creates a copy of a repository (a collection of folders and files related to a specific project) on your machine which can be easily updated with Git commands rather than requiring re-downloading every change to the code. Using this Git method will require first installing Git on your machine according to this guide.
Once installed, you can clone the EETC repository by clicking the green 'Code' dropdown and copying the 'HTTPS' url (this should begin with 'https://' and end with '.git').
Next, open either Git Bash (for Windows) or your Terminal (for MacOS) and navigate to the folder you'd like to keep this code stored in. For the code in this tutorial to work exactly, you should store this eetc repository in the same folder as this Jupyter notebook is saved in. To navigate to your folder of choice, use the 'cd' command followed by the folder path: for example, 'cd Documents/Crop-Type-Mapping/'. Once in this folder, type 'git clone https://github.com/shrutijain90/eetc.git', and the files will be downloaded onto your machine.
While GitHub is commonly used and helpful to learn, if you are unfamiliar with Git, you can also download a zip file of the code directly by selecting 'Download ZIP' in the dropdown of the green 'Code' button.
You'll want to unzip this file into the same folder as this Jupyter notebook is saved.
Once you've downloaded the 'eetc' folder using either of these methods, we can import the code to use. Specifically, we want to use their code for processing S2 Level-2A imagery, which is in the file eetc/gee_tools/datasources/sentinel2_2a.py.
# import Azzari et al. code; adding the cloned eetc repository to the import
# path makes its top-level gee_tools package importable
import sys
sys.path.append('eetc')
import gee_tools.datasources.sentinel2_2a as s2_2a
From this code file, we'll be using the class Sentinel2SR. By providing a geometry (our Malawi area), start and end dates for the imagery we want, and orders to calculate indices, this code will return an ImageCollection containing Sentinel-2 images with the Vegetation Indices calculated.
I'll define the start and end dates by the growing season. I'll use late November through mid-April, according to the FAO, though because S2 Level-2A data only exists from late 2018 onward, some of the November dates may be unavailable. For this demonstration that is not a problem, but if your use case requires dates in 2018 or before, you'll need to use the S2 Level-1C product and perform the atmospheric corrections described above in Section 2: Introduction to Data.
# Obtain ImageCollection for the specified area and dates with vegetation indices added (addVIs=True);
# we handle cloud masking ourselves below, so addCloudMasks is left False
# code from Azzari et al.
s2coll_2a = s2_2a.Sentinel2SR(malawi_geom.geometry(), start_date = ee.Date('2018-11-20'), end_date = ee.Date('2019-04-20'), addVIs=True, addCloudMasks=False).get_img_coll()
We can obtain a list of all the bands in the resulting ImageCollection by using this for loop to print each band name in the first Image in the collection:
for band in s2coll_2a.first().getInfo()['bands']:
print(band['id'])
AEROS
BLUE
GREEN
RED
RDED1
RDED2
RDED3
NIR
RDED4
VAPOR
SWIR1
SWIR2
AOT
WVP
SCL
TCI_R
TCI_G
TCI_B
MSK_CLDPRB
MSK_SNWPRB
QA10
QA20
QA60
NBR1
NBR2
STI
NDTI
CRC
REIP
GCVI
RDGCVI1
RDGCVI2
MTCI
MTCI2
WDRVI
GRWDRVI
RDWDRVI
RDNDVI1
RDNDVI2
NDVI
SNDVI
We can see that there are not only the original bands from the S2-2A imagery, but also newly calculated bands for the vegetation indices.
Next, we'll need to mask the clouds, which we can also do using code from Azzari et al. They use this code for maskClouds_sr() in eetc/gee_tools/datasources/toa_sr.py, which is used to convert the Sentinel-2 Level-1C Top Of Atmosphere reflectance to Surface Reflectance (and can also be used if you need to convert Level-1C to 2A for earlier dates). We'll only need this masking component for our purposes, so copying and pasting the masking function below will serve our needs.
# Use Azzari et al's function to mask clouds
def maskClouds_sr(img, bandnames):
    # SCL is the scene classification band: class 4 is vegetation, class 5 is bare soil
    scl = img.select(['SCL'])
    # combine the two clear-land classes with .Or(); Python's 'or' operator does not
    # work element-wise on ee.Images and would silently drop the second condition
    clear = scl.updateMask(scl.eq(4).Or(scl.eq(5)))
    img = img.updateMask(clear)
    return img.select(bandnames)
We'll now apply this function to every image in our S2 ImageCollection using the function imageCollection.map(), which cycles through every image in an ImageCollection, applies the provided function to the image, and returns each processed image as a new ImageCollection.
# Define the band names in the S2-2A data which we want to keep
bandnames = ['AEROS','BLUE','GREEN','RED','RDED1','RDED2','RDED3','NIR','RDED4','VAPOR','SWIR1','SWIR2','NBR1','NDTI','GCVI','NDVI','SNDVI']
# Mask clouds using the function above
s2coll_2a = s2coll_2a.map(lambda img: maskClouds_sr(img, bandnames))
Now, we can look into the ImageCollection we've created to ensure we have what we need.
for band in s2coll_2a.first().getInfo()['bands']:
print(band['id'])
AEROS
BLUE
GREEN
RED
RDED1
RDED2
RDED3
NIR
RDED4
VAPOR
SWIR1
SWIR2
NBR1
NDTI
GCVI
NDVI
SNDVI
Our ImageCollection is now ready for use. Let's try mapping one of the indices we just created to visualize the work we've done, limiting the range by only visualizing the GCVI for a buffer zone around our center point for simplicity. Note that it can take a few seconds for the GCVI layer to appear on the map.
# this is not a real plot from any survey data; it was randomly selected for demonstration
center_lat = -15.783006
center_lon = 35.437013
gcvi_samp = s2coll_2a.select('GCVI').filterBounds(ee.Geometry.Point([center_lon,center_lat]).buffer(2000))
map_point_g = geemap.foliumap.Map(center=[center_lat,center_lon],zoom=11)
map_point_g.add_basemap('SATELLITE')
# set color range to use for GCVI levels: black is lowest, beginning at 0, through green as the highest, ending at 5
gcvi_params = {'min': 0, 'max': 5, 'palette': ['black','red', 'orange', 'yellow', 'green']}
map_point_g.addLayer(gcvi_samp, gcvi_params)
map_point_g
While these indices were created for us in Azzari et al.'s code, let's quickly go over calculating with image bands in case you need different indices or calculations. Complex expressions can be performed using the Image.expression() function: you provide the expression as a string along with a dictionary mapping its variables to bands, then apply the expression to every image using .map(). For example, to calculate SNDVI, we use the formula (NIR - RED) / (NIR + RED + 0.16), where NIR is the 'NIR' band from the S2-2A imagery and RED is the 'RED' band.
sndvi = s2coll_2a.map(lambda img: img.expression(
'(NIR - RED) / (NIR + RED + 0.16)', {
'NIR': img.select('NIR'),
'RED': img.select('RED')
}).rename('SNDVI'))
Since we've obtained S2 imagery for our entire growing season duration, we can also visualize our data over time to view how GCVI, or any other band of interest, changes. The change in index values over time can provide important information, including a crop's growth cycle and pattern, known as crop phenology. As a crop grows from seed to full plant, the values of the indices for that area will also change, allowing researchers to obtain detailed information about a crop's growth. We can obtain some sample values for the GCVI band as an example.
The ImageCollection we obtain from S2 is comprised of multiple satellite Images taken at different times and areas, and each image contains multiple bands - including those used to calculate NDVI, GCVI, and the other vegetation indices. In order to view changes in any band over time for the same location, with location being at the pixel level, we need to extract the values for each band at that specific pixel from all timestamped Images.
Sentinel-2 satellites each pass over the same area every 10 days, so with the two Sentinel-2 satellites in operation, each location is captured once every 5 days: this is the temporal resolution of the images (ESA Sentinel-2 MSI Satellite Description n.d., ESA Sentinel-2 MSI Resolutions n.d.). The actual surface area measured by each pixel in an image is the spatial resolution. Each band in an S2 image has either 10m, 20m, or 60m spatial resolution, meaning each pixel can either represent a 10m x 10m area, 20m x 20m, or 60m x 60m (ESA Sentinel-2 Resolution and Swath n.d.). To identify the pixel size of a given band, you can consult the 'Bands' list in the Earth Engine Data Catalog page for the Sentinel-2 Level-2A product.
It is clear from these spatial and temporal resolutions that we will have a large number of pixels over even relatively small areas such as our sample plots, and the volume is compounded by having a copy of each pixel for every date on which we have an image. The size of the data we're working with makes extracting this information in Python very time-consuming.
For small areas (less than 262,144 pixels), we can extract values using Python with geemap.ee_to_numpy(), which uses the Google Earth Engine function ee.Image.sampleRectangle() to obtain the values, then converts them to a numpy array which we can work with in Python. For an example which obtains values over an area for a single image, see the Extract pixels as a Numpy array Geemap sample notebook. If your use case calls for extracting such values, you could follow the same process for a very small area. If you need values for a larger area, you'll need to move the work into Google Earth Engine itself: this stackoverflow post is a helpful starting point.
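As a quick sketch of that small-area path (assuming the plot geometry from earlier is small enough for sampleRectangle's pixel limit, and that geemap's ee_to_numpy helper behaves as described in the linked notebook), extracting one image's GCVI values for plot_a might look like:

import geemap
# take the first image and fill masked pixels with 0 so sampleRectangle doesn't error
first_img = s2coll_2a.first().select('GCVI').unmask(0)
# returns a numpy array of pixel values over the plot's region
gcvi_arr = geemap.ee_to_numpy(first_img, region=ee_plot_a)
print(gcvi_arr.shape)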
Obtaining pixel values over an area won't be necessary for our use case of developing an ML model, but it can be helpful to visualize these changes in indices over time as an indicator of crop phenology. This is an optional step, but we'll quickly demonstrate selecting a single point location and obtaining the GCVI values for this point to plot over time.
First, we'll define a point of interest whose GCVI values we'll consider over time, and we'll obtain the S2 ImageCollection for a wider timeframe to have a more complete picture of the change in GCVI.
# obtain extended timeframe for S2 2A imagery, if desired
s2coll_2a_ext = s2_2a.Sentinel2SR(malawi_geom.geometry(), start_date = ee.Date('2018-11-20'), end_date = ee.Date('2019-09-01'), addVIs=True, addCloudMasks=False).get_img_coll()
# Define the band names in the S2-2A data which we want to keep
bandnames = ['AEROS','BLUE','GREEN','RED','RDED1','RDED2','RDED3','NIR','RDED4','VAPOR','SWIR1','SWIR2','NBR1','NDTI','GCVI','NDVI','SNDVI']
# Mask clouds
s2coll_2a_ext = s2coll_2a_ext.map(lambda img: maskClouds_sr(img, bandnames))
# create point_g as earth engine point of interest for which to pull GCVI values
# this is not a real plot from any survey data; it was randomly selected for demonstration
ee_point_g = ee.Geometry.Point(35.437013,-15.783006)
map_point_g = geemap.foliumap.Map(center=[-15.783006, 35.437013],zoom=14)
map_point_g.add_basemap('SATELLITE')
map_point_g.addLayer(ee_point_g, vis_params={'color':'a4c639'})
map_point_g
Now, we'll define some functions to handle several tasks for us: building a list of image timestamps, building a matching list of band names, optionally sampling those lists, and extracting the pixel value and date from each unmasked image.
This will take a while to run: for the sample point g and our time frame, there are 52 Images to work through. Most of these are covered by clouds and masked. You'll be able to see the process at work through the print statements. One way we could make the processing go faster is to sample only a select number of images. The function includes the option to sample every xth image, according to this stackoverflow suggestion, but in the interest of ensuring we capture every image not covered by clouds, we'll refrain from sampling for this example.
# create list of all image timestamps
def create_timestamp_list(imgcoll):
"""
Create list of timestamps for all Images in ImageCollection
Inputs:
imgcoll (ImageCollection): ImageCollection to be processed
Returns (list): list of all timestamps of Images
"""
ts_list = ee.List(imgcoll \
.aggregate_array('system:time_start')) \
.map(lambda time_start:
ee.Date(time_start).format('Y-MM-dd HH:mm:ss')
) \
.getInfo()
return ts_list
# create list of all image indices; attach suffix to match
# band name format which will be applied once .toBands() is applied
def create_band_list(imgcoll, suffix):
"""
Create list of all indices for Images in ImageCollection
Inputs:
imgcoll (ImageCollection): ImageCollection to be processed
suffix (str): string in the form '_' + bandname (eg, '_GCVI')
to add to each Image system:index. This is done to match the
naming convention which will be used in .toBands() to convert
ImageCollection into single Image
Returns (list): list of all bands which will be in Image
created from ImageCollection.toBands()
"""
band_list = ee.List(imgcoll \
.aggregate_array('system:index')) \
.getInfo()
band_list = [band + suffix for band in band_list]
return band_list
def sample_lists(lst, every_x):
    """
    Reduces list size to every x-th item, maintaining temporal order
    Inputs:
        lst (list): list of dates or bands to sample
        every_x (int): specifies how many items from lst to keep
    Returns (list): sample of original list
    """
    # keep every x-th item ('lst' avoids shadowing the built-in 'list')
    list_samp = lst[0::every_x]
    return list_samp
# create dataframe with pixel values and timestamps
def extract_pixel_vals_dates(imgcoll, band, ee_point, \
band_list=None, time_list=None, sample=None):
"""
Extract the pixel values and timestamp for the specified band of
every image in an ImageCollection.
Inputs:
imgcoll (ImageCollection): ImageCollection containing band of interest
over time range
band (str): name of band to find values for from ImageCollection
ee_point (ee.Geometry.Point): point location for which to find band values
band_list (list or None): if same ImageCollection has already been processed,
can use created band_list of unmasked Images to avoid re-checking all
Images for clouds. If None, band_list is created as list of every band in
the Image created from original ImageCollection
time_list (list or None): if same ImageCollection has already been processed,
can use created time_list of unmasked Images to avoid re-checking all
Images for clouds. If None, time_list is created as list of timestamp
for every band in the Image created from original ImageCollection
sample (int or None): if provided, sample ImageCollection Images to only
process every x-th image. If None, full ImageCollection is processed.
Returns (DataFrame): Pandas DataFrame containing timestamp, band values, and band_ids
for each unmasked pixel
"""
# extract band of interest from image collection
print('selecting band of interest', band,'from imagecollection')
band_coll = imgcoll.select(band)
# filter imagecollection for only images containing point
print('filtering for only images containing point')
point_coll = band_coll.filterBounds(ee_point)
if not band_list:
# create list of images in point_coll
suffix = "_" + band
print('adding suffix',suffix,'to band name and creating list of bands')
band_list = create_band_list(point_coll, suffix)
print(
"""
number of ids in list: {}
first 5 ids in list: {}
""".format(
len(band_list),
band_list[0:5]
))
if not time_list:
# create list of timestamps for images in point_coll
print('creating list of timestamps')
time_list = create_timestamp_list(point_coll)
print(
"""
number of dates in list: {}
first 5 dates in list: {}
""".format(
len(time_list),
time_list[0:5]
))
if sample:
# sample band and time list to only process some images
print('generating sample of images to process')
band_list = sample_lists(band_list, sample)
time_list = sample_lists(time_list, sample)
# collapse point_coll into single image with multiple bands
print('creating single image from imagecollection')
point_img = point_coll.toBands()
    # create empty dataframe to store timestamps, values, and band ids
    time_pixel_vals = pd.DataFrame(columns=['datetime',band,'band_id'])
print('created empty dataframe',time_pixel_vals.head())
# loop through list we created to process each band
for idx, band_id in enumerate(band_list):
# select band by id
img = point_img.select(band_id)
if img.mask().reduceRegion(ee.Reducer.first(),ee_point).getInfo()[band_id] != 0:
# obtain the start time associated with the current band
time_start = time_list[idx]
print('getting pixel values for img from',time_start,'...')
            # extract the band value at the point
            pixel_vals = img.reduceRegion(ee.Reducer.first(),ee_point).getInfo()[band_id]
            # append a row with the timestamp, band value, and band id
            # (pd.concat is used because DataFrame.append was removed in pandas 2.0)
            time_pixel_vals = pd.concat([time_pixel_vals,
                pd.DataFrame([{'datetime':time_start, band:pixel_vals, 'band_id':band_id}])],
                ignore_index=True)
else:
print('masked pixel, moving on to next image')
return time_pixel_vals
# use function to obtain GCVI values for point g over time
# NOTE: this will take time to run, especially with 50+ Images
gcvi_time_pixel_vals = extract_pixel_vals_dates(s2coll_2a_ext, 'GCVI', ee_point_g)
selecting band of interest GCVI from imagecollection
filtering for only images containing point
adding suffix _GCVI to band name and creating list of bands

number of ids in list: 52
first 5 ids in list: ['20181217T073311_20181217T075134_T36LYH_GCVI', '20181222T073319_20181222T074542_T36LYH_GCVI', '20181227T073321_20181227T075136_T36LYH_GCVI', '20190101T073319_20190101T075335_T36LYH_GCVI', '20190106T073301_20190106T075124_T36LYH_GCVI']

creating list of timestamps

number of dates in list: 52
first 5 dates in list: ['2018-12-17 07:54:18', '2018-12-22 07:54:22', '2018-12-27 07:54:20', '2019-01-01 07:54:24', '2019-01-06 07:54:22']

creating single image from imagecollection
created empty dataframe Empty DataFrame
Columns: [datetime, GCVI, band_id]
Index: []
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
getting pixel values for img from 2019-01-31 07:54:27 ...
masked pixel, moving on to next image
getting pixel values for img from 2019-02-10 07:54:27 ...
masked pixel, moving on to next image
masked pixel, moving on to next image
getting pixel values for img from 2019-02-25 07:54:22 ...
masked pixel, moving on to next image
masked pixel, moving on to next image
getting pixel values for img from 2019-03-12 07:54:24 ...
masked pixel, moving on to next image
getting pixel values for img from 2019-03-22 07:54:27 ...
getting pixel values for img from 2019-03-27 07:54:26 ...
getting pixel values for img from 2019-04-01 07:54:29 ...
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
getting pixel values for img from 2019-04-21 07:54:33 ...
getting pixel values for img from 2019-04-26 07:54:30 ...
masked pixel, moving on to next image
masked pixel, moving on to next image
getting pixel values for img from 2019-05-11 07:54:35 ...
masked pixel, moving on to next image
masked pixel, moving on to next image
getting pixel values for img from 2019-05-31 07:54:35 ...
masked pixel, moving on to next image
getting pixel values for img from 2019-06-10 07:54:34 ...
masked pixel, moving on to next image
getting pixel values for img from 2019-06-20 07:54:35 ...
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
masked pixel, moving on to next image
# visualize first five rows of created dataset
gcvi_time_pixel_vals.head()
datetime | GCVI | band_id | |
---|---|---|---|
0 | 2019-01-31 07:54:27 | 2.834971 | 20190131T073139_20190131T074730_T36LYH_GCVI |
1 | 2019-02-10 07:54:27 | 2.476578 | 20190210T073039_20190210T074611_T36LYH_GCVI |
2 | 2019-02-25 07:54:22 | 3.261002 | 20190225T072901_20190225T075255_T36LYH_GCVI |
3 | 2019-03-12 07:54:24 | 3.236196 | 20190312T073019_20190312T074735_T36LYH_GCVI |
4 | 2019-03-22 07:54:27 | 2.745631 | 20190322T072619_20190322T074833_T36LYH_GCVI |
# save pixel values to file for easier repeated use
gcvi_time_pixel_vals.to_csv('gcvi_time_pixel_vals_ext_pointg.csv',index=False)
We can see how many of our images were unable to provide values due to cloud cover - all the images for which the location was masked. However, we also have enough of a sample to plot the change in GCVI over our timeframe of interest for this location. We won't need the time to be as granular as it is currently - plotting the values by day will be fine for our purposes. We'll extract the day from the datetime and take the mean GCVI for each day to use in plotting:
from datetime import date
# convert to datetime type
gcvi_time_pixel_vals['datetime'] = pd.to_datetime(gcvi_time_pixel_vals['datetime'])
# create date field from datetime
gcvi_time_pixel_vals['date'] = gcvi_time_pixel_vals['datetime'].dt.date
# group by date and take mean GCVI for each day
gcvi_day_pixel_df = gcvi_time_pixel_vals.groupby('date')['GCVI'].mean().reset_index()
# view newly created dataset by day
gcvi_day_pixel_df.head()
| | date | GCVI |
|---|---|---|
| 0 | 2019-01-31 | 2.834971 |
| 1 | 2019-02-10 | 2.476578 |
| 2 | 2019-02-25 | 3.261002 |
| 3 | 2019-03-12 | 3.236196 |
| 4 | 2019-03-22 | 2.745631 |
Now, let's plot these values over time to see the changes.
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
sns.set(style='whitegrid')
f,ax=plt.subplots(1,1, figsize=(12,8))
line = sns.lineplot(x='date',y='GCVI',data=gcvi_day_pixel_df,ax=ax)
ax.set_title('GCVI for Point over Time Frame')
ax.set_ylabel('GCVI')
ax.set_xlabel('Date')
We've plotted the mean GCVI values for our sample point g over our growing season. The values are fairly choppy, especially given the number of images masked due to cloud cover, which can make it more difficult to determine the pattern over time. This is why smoothing is often applied to indices before using them to determine crop phenology.
Azzari et al. fit a harmonic regression model to the time series of each band, which means they include the time values within sine and cosine terms in their regression equation. For a brief introduction to using the sum of sine and cosine terms, known as a Fourier series, in functions representing periodic motion and to harmonic analysis, see this Britannica explainer. While other methods have been explored for accurately capturing the changes in crops over seasons, the use of harmonic regression has been found to be successful in representing crop behavior (Jin et al. 2019, Deines et al. 2021).
Azzari et al. also provide code which will perform this harmonic regression and return the coefficients, which are the values we will ultimately feed into our model. For Malawi, they use a harmonic regression model with two pairs of harmonic terms (i.e., two sine and two cosine terms). Although they find three pairs performs better for Ethiopia, their tests overall did not show significant improvement from tuning the number of harmonic pairs, so for simplicity two pairs should suffice for most use cases. The code we'll be using allows us to specify how many harmonic pairs to use, with a default of 2, so this could be adjusted if needed. For our use case with two harmonic pairs, we'll be running the regression $$GCVI_t=\beta_0+\beta_1 t+\beta_2\cos(2\pi\omega_1 t)+\beta_3\sin(2\pi\omega_1 t)+\beta_4\cos(2\pi\omega_2 t)+\beta_5\sin(2\pi\omega_2 t)+\epsilon$$ with $\omega_1=1$ and $\omega_2=2$.
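To build intuition for what this regression is doing before we turn to the provided tools, here's a minimal standalone sketch (not the eetc implementation) that fits the two-pair harmonic model above to a synthetic GCVI series with numpy; the time values and noise level are illustrative assumptions:
import numpy as np
# t: time as a fraction of the year since the reference date (illustrative values)
t = np.linspace(0, 1, 40)
# synthetic GCVI series: a seasonal cycle plus noise, standing in for real observations
gcvi = 3 + 0.5 * np.sin(2 * np.pi * t) + np.random.normal(0, 0.1, t.size)
omega1, omega2 = 1, 2
# design matrix: constant, t, then cos/sin pairs for each omega, matching the equation above
X_design = np.column_stack([
    np.ones_like(t),
    t,
    np.cos(2 * np.pi * omega1 * t), np.sin(2 * np.pi * omega1 * t),
    np.cos(2 * np.pi * omega2 * t), np.sin(2 * np.pi * omega2 * t),
])
# least-squares estimates of beta_0 through beta_5
betas, *_ = np.linalg.lstsq(X_design, gcvi, rcond=None)
# smoothed GCVI values along the fitted regression curve
fitted = X_design @ betas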
The function we'll be using, run_std_regressions(), is stored in eetc/gee_tools/harmonics.py. To use it, we'll provide an ImageCollection of our band or index of interest (i.e., selecting only the GCVI bands from our s2coll_2a ImageCollection), the band we are interested in performing the regression on (i.e., GCVI), a reference date if desired, and the number of harmonic pairs. There are also options to specify which time bands (constant, sine, cosine, etc.) should be included, an option to specify the value of omega, and an option to return the result as an ImageCollection rather than a single Image with all bands, but these each have default values which will work for our purposes.
This function first adds our harmonic terms to the ImageCollection as new bands (i.e., our specified number of sine, cosine, and time bands), then performs the regression and computes the variance, the $R^2$ (the proportion of the variance in our outcome of interest, GCVI, that the model successfully captures), and the root mean square error (rmse). The coefficients from the harmonic terms and these measures of model fit are all returned as separate bands in an Image. We'll demonstrate this process first with just GCVI to get a better understanding of the mechanism.
# Import code we'll use from Azzari et al.
from eetc.gee_tools import harmonics
# Filter for only our index of interest in this example, GCVI
gcvi = s2coll_2a.select('GCVI')
# Specify band in the ImageCollection we want to use in regression
dep_bands = ['GCVI']
# Run the regression to produce an image with our terms as bands; using first day of growing season as refdate
hrmregr_coefs_gcvi = harmonics.run_std_regressions(gcvi,dep_bands,refdate='2018-10-20')
# Cycle through the bands to see our result
for band in hrmregr_coefs_gcvi.getInfo()['bands']:
print(band['id'])
GCVI_sin2
GCVI_cos2
GCVI_sin1
GCVI_cos1
GCVI_t
GCVI_constant
GCVI_variance
GCVI_count
GCVI_mean
GCVI_rmse
GCVI_r2
This looks exactly correct - we have two sine terms, two cosine terms, a constant, a time term (t), and the variance, mean, rmse, and r2 for the model. The sine, cosine, constant, and time terms correspond to the $\beta$ coefficients in the regression above. In Appendix Table A2 of Azzari et al.'s paper, we see these are the terms we're interested in obtaining for each of our bands or indices.
As an optional step, let's now try to recreate our time series plots above using the smoothed values for our indices. We can once again use GCVI and the extended time range for a more complete chart. We'll first need to obtain the coefficients using harmonics.run_std_regressions(), then we'll use these coefficients to fit our GCVI data to the regression curve, smoothing the output values. To fit the GCVI to our coefficients, we'll use harmonics.fit_harmonics(). We'll provide the coefficients, the gcvi_ext ImageCollection, an omega value (we'll use the default of 1), the number of harmonic pairs, the band of interest, and our same reference date of the start of the growing season.
# Filter for only our index of interest in this example, GCVI
gcvi_ext = s2coll_2a_ext.select('GCVI')
# Specify band in the ImageCollection we want to use in regression
dep_bands = ['GCVI']
# Run the regression to produce an image with our terms as bands; using first day of growing season as refdate
hrmregr_coefs_gcvi_ext = harmonics.run_std_regressions(gcvi_ext,dep_bands,refdate='2018-10-20')
# create new, smoothed GCVI values by fitting the original to the harmonic regression with coefficients created above
hrmregr_fit_gcvi_ext = harmonics.fit_harmonics(hrmregr_coefs_gcvi_ext, gcvi_ext, omega=1,nharmonics=2,bands=dep_bands,refdate='2018-10-20')
Now, we could simply rerun our extract_pixel_vals_dates() function with these GCVI values fit to the harmonic regression, but this processing is much slower than for the original values, so it will be helpful to reduce the processing needed as much as possible. We can do this by reusing the lists of timestamps and band_ids created in our first pass, which identify the Images where our location, ee_point_g, is unmasked. We'll use the function below to create these lists from the DataFrame we generated above, gcvi_time_pixel_vals. We'll just have to specify that our band name will now be 'GCVI_HARMFIT' rather than 'GCVI'.
# generate lists of known images where our location is not masked
def create_lists_for_extract(df, suffix_to_remove, suffix_to_add):
    """
    Create list of band ids and timestamps from previously-generated
    DataFrame from extract_pixel_vals_dates().
    Inputs:
        df (DataFrame): DataFrame returned from extract_pixel_vals_dates()
        suffix_to_remove (str): original suffix added to Image system:index,
            in the format of '_' + bandname
        suffix_to_add (str): new suffix to add to Image system:index in format
            '_' + new bandname
    Returns (list, list): list of timestamps, list of band ids as tuple
    """
    time_list = df['datetime'].to_list()
    band_list = df['band_id'].to_list()
    band_list = [band.replace(suffix_to_remove, '') for band in band_list]
    band_list = [band + suffix_to_add for band in band_list]
    return time_list, band_list
time_list, band_list = create_lists_for_extract(gcvi_time_pixel_vals, '_GCVI','_GCVI_HARMFIT')
band_list[0:5]
['20190131T073139_20190131T074730_T36LYH_GCVI_HARMFIT', '20190210T073039_20190210T074611_T36LYH_GCVI_HARMFIT', '20190225T072901_20190225T075255_T36LYH_GCVI_HARMFIT', '20190312T073019_20190312T074735_T36LYH_GCVI_HARMFIT', '20190322T072619_20190322T074833_T36LYH_GCVI_HARMFIT']
# use our function to extract values for each pixel
# NOTE: this will be VERY slow, much slower than for the original values even using pre-filtered lists - choose a smaller time frame to minimize processing
# using Google Earth Engine to perform these calculations and extract as a .csv is also an option for faster processing
gcvi_hrmregr_time_pixel_vals = extract_pixel_vals_dates(hrmregr_fit_gcvi_ext, 'GCVI_HARMFIT', ee_point_g,band_list=band_list, time_list=time_list)
selecting band of interest GCVI_HARMFIT from imagecollection
filtering for only images containing point
number of ids in list: 13
first 5 ids in list: ['20190131T073139_20190131T074730_T36LYH_GCVI_HARMFIT', '20190210T073039_20190210T074611_T36LYH_GCVI_HARMFIT', '20190225T072901_20190225T075255_T36LYH_GCVI_HARMFIT', '20190312T073019_20190312T074735_T36LYH_GCVI_HARMFIT', '20190322T072619_20190322T074833_T36LYH_GCVI_HARMFIT']
number of dates in list: 13
first 5 dates in list: [Timestamp('2019-01-31 07:54:27'), Timestamp('2019-02-10 07:54:27'), Timestamp('2019-02-25 07:54:22'), Timestamp('2019-03-12 07:54:24'), Timestamp('2019-03-22 07:54:27')]
creating single image from imagecollection
created empty dataframe
Empty DataFrame
Columns: [datetime, GCVI_HARMFIT, band_id]
Index: []
getting pixel values for img from 2019-01-31 07:54:27 ...
getting pixel values for img from 2019-02-10 07:54:27 ...
getting pixel values for img from 2019-02-25 07:54:22 ...
getting pixel values for img from 2019-03-12 07:54:24 ...
getting pixel values for img from 2019-03-22 07:54:27 ...
getting pixel values for img from 2019-03-27 07:54:26 ...
getting pixel values for img from 2019-04-01 07:54:29 ...
getting pixel values for img from 2019-04-21 07:54:33 ...
getting pixel values for img from 2019-04-26 07:54:30 ...
getting pixel values for img from 2019-05-11 07:54:35 ...
getting pixel values for img from 2019-05-31 07:54:35 ...
getting pixel values for img from 2019-06-10 07:54:34 ...
getting pixel values for img from 2019-06-20 07:54:35 ...
# save as csv to avoid repeating the processing
gcvi_hrmregr_time_pixel_vals.to_csv('gcvi_hrmregr_ext_time_pixel_vals_pointg.csv',index=False)
# convert to datetime type
gcvi_hrmregr_time_pixel_vals['datetime'] = pd.to_datetime(gcvi_hrmregr_time_pixel_vals['datetime'])
# extract date from datetime
gcvi_hrmregr_time_pixel_vals['date'] = gcvi_hrmregr_time_pixel_vals['datetime'].dt.date
# group data by date, take mean of all GCVI_HARMFIT values for each date
gcvi_hrmregr_day_pixel_df = gcvi_hrmregr_time_pixel_vals.groupby('date')['GCVI_HARMFIT'].mean().reset_index()
gcvi_hrmregr_day_pixel_df.head()
| | date | GCVI_HARMFIT |
|---|---|---|
| 0 | 2019-01-31 | 2.825936 |
| 1 | 2019-02-10 | 2.658152 |
| 2 | 2019-02-25 | 2.782984 |
| 3 | 2019-03-12 | 3.200063 |
| 4 | 2019-03-22 | 3.515674 |
Now, to plot these values with smooth connecting lines that highlight the smoothing effect of the regression, we'll use the Python package scipy, which provides a set of functions for interpolation - estimating a function that passes through a set of known data points. For our case, we'll just use scipy.interpolate.interp1d to create a smooth estimate of the line connecting our smoothed GCVI values. However, this function cannot use dates directly as one of its dimensions, so we'll first need to convert each date to an ordinal date, a numeric type we can use with interp1d and in plotting. The process below is derived from two helpful stackoverflow posts: one on using ordinal dates for plotting and one on using interp1d for smooth plots.
gcvi_hrmregr_day_pixel_df['date_ordinal'] = pd.to_datetime(gcvi_hrmregr_day_pixel_df['date']).apply(lambda date: date.toordinal())
gcvi_day_pixel_df['date_ordinal'] = pd.to_datetime(gcvi_day_pixel_df['date']).apply(lambda date: date.toordinal())
gcvi_hrmregr_day_pixel_df.head()
| | date | GCVI_HARMFIT | date_ordinal |
|---|---|---|---|
| 0 | 2019-01-31 | 2.825936 | 737090 |
| 1 | 2019-02-10 | 2.658152 | 737100 |
| 2 | 2019-02-25 | 2.782984 | 737115 |
| 3 | 2019-03-12 | 3.200063 | 737130 |
| 4 | 2019-03-22 | 3.515674 | 737140 |
from scipy.interpolate import interp1d
import numpy as np
f, ax = plt.subplots(1, 1, figsize=(15, 10))
# obtain the original dates and GCVI values for point g to use as comparison to our smoothed values
x_orig = gcvi_day_pixel_df['date_ordinal']
y_orig = gcvi_day_pixel_df['GCVI']
# obtain point g dates and smoothed GCVI values to use as our x and y values
x = gcvi_hrmregr_day_pixel_df['date_ordinal']
y = gcvi_hrmregr_day_pixel_df['GCVI_HARMFIT']
# create 500 evenly spaced numbers over the range of our ordinal dates
xnew = np.linspace(x.min(), x.max(), 500)
# approximate a smooth function from the date and GCVI values
# (named f_interp so it doesn't overwrite the figure handle f above)
f_interp = interp1d(x, y, kind='cubic')
# create y values, using our approximated function, to pair with our 500 x values
y_smooth = f_interp(xnew)
# plot the smoothed gcvi points from our dataset
plt.scatter(x, y, c='teal', s=50)
# plot the smoothed line connecting the points
plt.plot(xnew, y_smooth,c='teal', linewidth=3)
# plot the original gcvi points from our dataset
plt.scatter(x_orig, y_orig,c='sienna',s=50)
# define the axes and title labels
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('GCVI', fontsize=12)
ax.set_title('Harmonic Fitted GCVI and Observed GCVI',fontsize=15)
# create dates from ordinal dates to use along x axis for easy interpretation
date_labels = [date.fromordinal(int(item)) for item in ax.get_xticks()]
ax.set_xticklabels(date_labels, rotation=45,fontsize=10)
ax.set_yticklabels(ax.get_yticks(),fontsize=10)
# remove bounding box from plot
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
plt.show()
We now have a visual representation of the GCVI changes at our sample point over time, with the teal dots and line showing the smoothed values we obtained from harmonics.fit_harmonics(). The sienna dots are the original, observed GCVI values prior to smoothing. Though this is only a sample point, and should not be taken as representing maize or any other crop, performing the same process on an identified point provides an interpretable representation of the pattern in GCVI over time. This is useful not only for better understanding maize growth; it will also be important for our model as another factor for identifying maize. Maize and most other crops have distinctive phenologies, which can be captured and represented by the vegetation indices created from satellite imagery. Thus, by providing our model with the coefficients from our regressions, which define maize's unique phenology, the model will be better able to distinguish maize from other crops.
The final step is thus to obtain the mean coefficient values for each of our plot areas of interest, so that we have a single number per coefficient per plot. This will require the geographic boundaries of the plots to define the areas over which we'll calculate the mean. We'll use our sample plots created above - specifically, the Earth Engine versions of our plots we created with geemap: ee_plot_a, ee_plot_b, and ee_plot_c.
Google Earth Engine provides a method, reduceRegion(), to compute statistics for images over a given area. For all of our GCVI features, we want to find the mean value for each plot - for example, the mean GCVI_cos1 for each plot over our selected time frame. We'll use a reducer for this purpose, which allows you to aggregate over areas, time, and bands, among other Image characteristics. We'll use ee.Reducer.mean() to calculate the mean values of our Image. reduceRegion() enables us to define both the reducer we want to use and the area to reduce over. For more information on reducing and this use case, see Google Earth Engine's Computation Using Images tutorial.
For an example, let's calculate the mean value of the GCVI bands for plot c.
# Calculate mean GCVI coefficient values for area
gcvi_c = hrmregr_coefs_gcvi.reduceRegion(
    reducer = ee.Reducer.mean(),
    geometry = ee_plot_c,
    scale = 30
)
mean_gcvi_constant = gcvi_c.get('GCVI_constant')
print('Mean constant from GCVI harmonic regression for plot c:',mean_gcvi_constant.getInfo())
Mean constant from GCVI harmonic regression for plot c: 263.0954490816946
We've just pulled out the GCVI_constant term, which is the constant from the harmonic regression on the GCVI band, but this method provides the mean for plot c for all bands in the hrmregr_coefs_gcvi Image.
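If you'd like to check every band at once rather than pulling out terms one by one, the full dictionary of per-band means can be retrieved in a single call:
# view the mean of every coefficient band for plot c at once
print(gcvi_c.getInfo())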
Though calculating these individual plot values can be helpful to validate that your process is working as expected, in practice it will be much more efficient to obtain these values for all plots of interest at once. We can use ee.Image.reduceRegions() to perform the same calculation of the mean for multiple areas. We'll need to feed reduceRegions() a FeatureCollection of all of our areas of interest, which we can easily create by converting our survey_gdf GeoDataFrame to a FeatureCollection using geemap.geopandas_to_ee().
ee_plots_gdf = geemap.geopandas_to_ee(survey_gdf.loc[:,('unique_id','geometry')])
ee_plots_gdf.getInfo()
{'type': 'FeatureCollection', 'columns': {'system:index': 'String', 'unique_id': 'Long'}, 'features': [{'type': 'Feature', 'geometry': {'type': 'Polygon', 'coordinates': [[[34.557479999973054, -14.305433999844611], [34.55742399997311, -14.304968999845014], [34.556232999973126, -14.305095999845069], [34.55665099997305, -14.305579999844612], [34.557479999973054, -14.305433999844611]]]}, 'id': '0', 'properties': {'unique_id': -6.645496217314903e+18}}, {'type': 'Feature', 'geometry': {'type': 'Polygon', 'coordinates': [[[33.93234000038297, -11.56669100270623], [33.931981000383054, -11.566139002706953], [33.93116000038293, -11.566817002706246], [33.9314710003829, -11.56712700270583], [33.93234000038297, -11.56669100270623]]]}, 'id': '1', 'properties': {'unique_id': -8.821404992437737e+18}}, {'type': 'Feature', 'geometry': {'type': 'Polygon', 'coordinates': [[[33.34090200009175, -13.601511000623603], [33.34000100009203, -13.599425000625658], [33.33809600009197, -13.600041000625328], [33.33913200009168, -13.602178000623226], [33.34090200009175, -13.601511000623603]]]}, 'id': '2', 'properties': {'unique_id': 2.659752206514122e+18}}]}
By viewing the contents of our new FeatureCollection, we can see it's taken each row of our original GeoDataFrame and converted it to a Feature, which is comprised of a geometry (our plot polygons), a unique 'id', and 'properties' which include the unique_id column we selected from the GeoDataFrame. This looks good! Now, we can obtain the mean for each index and band for each plot in this FeatureCollection.
First, to demonstrate using the entire FeatureCollection with reduceRegions(), let's do an example of calculating the mean GCVI values for each of our plots. We'll use reduceRegions() with a similar set-up to our reduceRegion() code above, except instead of a 'geometry' value we'll provide a 'collection': our FeatureCollection containing each plot for which we want to calculate the means. Once reduceRegions() is performed, we'll have a FeatureCollection with additional properties for the mean values. We can convert this FeatureCollection back to a GeoDataFrame to more easily view our newly calculated data.
# Calculate mean GCVI values for areas
ee_plots_gcvi = hrmregr_coefs_gcvi.reduceRegions(
    reducer = ee.Reducer.mean(),
    collection = ee_plots_gdf,
    scale = 30
)
# convert to DataFrame
plots_with_gcvi = geemap.ee_to_pandas(ee_plots_gcvi)
plots_with_gcvi
| | GCVI_sin2 | GCVI_cos2 | GCVI_sin1 | GCVI_cos1 | GCVI_t | GCVI_constant | GCVI_variance | GCVI_count | GCVI_mean | GCVI_rmse | GCVI_r2 | unique_id |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5.988830 | -54.051678 | -267.402573 | -303.735510 | -1853.809982 | 678.805935 | 0.597549 | 8.246671 | 2.978578 | 0.111548 | 0.955892 | -6.645496e+18 |
| 1 | -1134.519602 | -282.059117 | -6342.417208 | 206.360703 | -23693.844957 | 12383.461693 | 0.198886 | 11.478678 | 2.562344 | 0.112264 | 0.913124 | -8.821405e+18 |
| 2 | 11.031942 | -20.687845 | -86.848001 | -146.920897 | -777.544449 | 263.095449 | 0.763646 | 34.418301 | 2.730780 | 0.608733 | 0.420253 | 2.659752e+18 |
Our final step is now to put all these pieces together to obtain the mean values for all of our bands and indices of interest for each of our plots. We can do this using a for loop to cycle through each band of interest, run the harmonic regression, obtain the coefficient values, calculate the mean for each plot, and return the results as a single DataFrame. Note that this may take some time, especially for many plots, as it needs to perform these calculations across each of them for multiple bands.
# define bands of interest
bands_indices = ['RDED4','GCVI','NBR1','NDTI','NDVI','SNDVI']
# create empty dataframe to fill with values
s2_2a_df = pd.DataFrame()
# for each band, run harmonic regression to obtain seasonality features
for band in bands_indices:
    dep_bands = [band]
    # select only band/index of interest
    imgcoll = s2coll_2a.select(dep_bands)
    # run harmonic regression on ImageCollection
    hrmregr_img = harmonics.run_std_regressions(imgcoll, dep_bands, None)
    # use resulting image to obtain mean for each region of interest
    means = hrmregr_img.reduceRegions(
        reducer = ee.Reducer.mean(),
        collection = ee_plots_gdf,
        scale = 30
    )
    # convert to DataFrame
    means_df = geemap.ee_to_pandas(means)
    # if first band, define s2_2a_df as means_df
    if s2_2a_df.size == 0:
        s2_2a_df = means_df.copy()
    # else, join new bands onto s2_2a_df dataframe
    else:
        s2_2a_df = s2_2a_df.merge(means_df, on='unique_id')
s2_2a_df
| | RDED4_sin2 | RDED4_cos2 | RDED4_sin1 | RDED4_cos1 | RDED4_t | RDED4_constant | RDED4_variance | RDED4_count | RDED4_mean | RDED4_rmse | ... | SNDVI_cos2 | SNDVI_sin1 | SNDVI_cos1 | SNDVI_t | SNDVI_constant | SNDVI_variance | SNDVI_count | SNDVI_mean | SNDVI_rmse | SNDVI_r2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -10.591325 | -8.191404 | -31.530930 | 90.364250 | 433.689077 | -518.412639 | 0.003874 | 8.246671 | 0.358509 | 0.017484 | ... | -2.875033 | -9.783135 | 39.243345 | 179.517173 | -216.949060 | 0.005259 | 8.246671 | 0.507617 | 0.012290 | 0.957053 |
| 1 | -120.576523 | -221.995186 | -956.855148 | 1547.635232 | 8243.978523 | -9569.437735 | 0.005527 | 11.478678 | 0.324858 | 0.004904 | ... | -97.810958 | -454.925323 | -4.821486 | 670.055357 | -479.777723 | 0.003972 | 11.478678 | 0.380890 | 0.007652 | 0.886615 |
| 2 | -1.127877 | -2.311148 | -10.650291 | 17.013766 | 93.714036 | -108.520916 | 0.004729 | 34.418301 | 0.286631 | 0.052480 | ... | -0.123783 | 0.180259 | 3.434490 | 12.999024 | -16.113102 | 0.006167 | 34.418301 | 0.428591 | 0.057037 | 0.404706 |
3 rows × 67 columns
This is a large dataframe, so we can view the columns we created to ensure we have all we expect:
s2_2a_df.columns
Index(['RDED4_sin2', 'RDED4_cos2', 'RDED4_sin1', 'RDED4_cos1', 'RDED4_t', 'RDED4_constant', 'RDED4_variance', 'RDED4_count', 'RDED4_mean', 'RDED4_rmse', 'RDED4_r2', 'unique_id', 'GCVI_sin2', 'GCVI_cos2', 'GCVI_sin1', 'GCVI_cos1', 'GCVI_t', 'GCVI_constant', 'GCVI_variance', 'GCVI_count', 'GCVI_mean', 'GCVI_rmse', 'GCVI_r2', 'NBR1_sin2', 'NBR1_cos2', 'NBR1_sin1', 'NBR1_cos1', 'NBR1_t', 'NBR1_constant', 'NBR1_variance', 'NBR1_count', 'NBR1_mean', 'NBR1_rmse', 'NBR1_r2', 'NDTI_sin2', 'NDTI_cos2', 'NDTI_sin1', 'NDTI_cos1', 'NDTI_t', 'NDTI_constant', 'NDTI_variance', 'NDTI_count', 'NDTI_mean', 'NDTI_rmse', 'NDTI_r2', 'NDVI_sin2', 'NDVI_cos2', 'NDVI_sin1', 'NDVI_cos1', 'NDVI_t', 'NDVI_constant', 'NDVI_variance', 'NDVI_count', 'NDVI_mean', 'NDVI_rmse', 'NDVI_r2', 'SNDVI_sin2', 'SNDVI_cos2', 'SNDVI_sin1', 'SNDVI_cos1', 'SNDVI_t', 'SNDVI_constant', 'SNDVI_variance', 'SNDVI_count', 'SNDVI_mean', 'SNDVI_rmse', 'SNDVI_r2'], dtype='object')
This set of columns aligns with the Sentinel-2 columns in Appendix Table A2 from Azzari et al., which means we've successfully obtained the necessary information from S2 for each of our plots to continue building our machine learning model. Note that, while we've calculated each of these fields, Azzari et al. pre-selected features to avoid overfitting in their final model, and they've indicated these with * in Appendix Table A2. One of the concerns with machine learning models is that they can become too accustomed to the exact data provided and fail to handle new data accurately when you try to actually use the model. This will lead the model to seem to perform very well when predicting on your training dataset, but once you attempt to use it for other purposes, you'll see much poorer performance. Azzari et al. used the Mutual Information score to determine which features to keep: $$MI(X;Y)=\int\int f(x,y)\log\frac{f(x,y)}{f(x)f(y)}\,dx\,dy$$ and ensured no remaining features were highly correlated (i.e., provided similar information). While performing this process yourself to select features could improve your model's performance, the same features selected by Azzari et al. can be used for simplicity (see below).
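If you do want to perform this pre-selection yourself, scikit-learn provides an estimator that approximates the mutual information between each feature and the label. Here's a minimal sketch, where features_df (a DataFrame of candidate features) and maize_labels (a 0/1 label series) are placeholders for your own data:
from sklearn.feature_selection import mutual_info_classif
import pandas as pd
# estimate mutual information between each candidate feature and the maize label
mi_scores = mutual_info_classif(features_df, maize_labels, random_state=0)
# rank features from most to least informative
mi_ranked = pd.Series(mi_scores, index=features_df.columns).sort_values(ascending=False)
# keep the top-ranked features, then drop any highly correlated pairs among them
print(mi_ranked.head(10))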
To obtain the elevation, slope, and aspect information for our plots, we will use the Shuttle Radar Topography Mission Version 3.0, which we can find in Google Earth Engine as the Image ee.Image("USGS/SRTMGL1_003")
.
First, we'll set the variables to use in downloading the elevation data:
# define GEE id of SRTM V3 Image
srtm = ee.Image('USGS/SRTMGL1_003')
We can obtain the elevation itself by selecting the band elevation from srtm. Then, we can use this elevation Image along with the ee.Terrain package to obtain the slope and aspect information.
elevation = srtm.select('elevation')
slope = ee.Terrain.slope(elevation)
aspect = ee.Terrain.aspect(elevation)
We want to calculate the elevation, slope, and aspect for each plot of land and store the values for use in our model. This will again require the geographic boundaries of the plots to define the areas over which we'll calculate average elevation, slope, and aspect. We can use the same geemap sample plots created above to calculate average elevation, slope, and aspect with the reduceRegions() method. To begin, though, let's first demonstrate with a single Image and plot: we'll calculate the mean slope in plot c using reduceRegion() and the slope Image we just created.
# Calculate mean slope for area
slope_c = slope.reduceRegion(
    reducer = ee.Reducer.mean(),
    geometry = ee_plot_c,
    scale = 30
)
mean_slope_c = slope_c.get('slope')
print('Mean slope for plot c:',mean_slope_c.getInfo())
Mean slope for plot c: 2.4228431489192244
We can also visualize these topography characteristics on a map to get a better sense of what we're calculating. For example, if we'd like to view areas of higher slope, we can plot slope on top of a map of the areas we're interested in.
# this is not a real plot from any survey data; it was randomly selected for demonstration
center_lat = -13.600627
center_lon = 33.339620
elev_map = geemap.foliumap.Map(center=[center_lat,center_lon], zoom=9)
elev_map.addLayer(slope, {'min': 0, 'max': 90}, 'slope')
elev_map
In this example, we can visualize areas of higher and lower slope as shades of grey. While this provides the information we need, it can often be helpful to display satellite imagery using a more extensive color palette to more easily distinguish the gradient of values. Here's an example of the elevation we obtained from SRTM using a color palette from this Bikesh Bade blog.
# this is not a real plot from any survey data; it was randomly selected for demonstration
center_lat = -13.600627
center_lon = 33.339620
map_plot_c = geemap.foliumap.Map(center=[center_lat,center_lon],zoom=6)
terrain_params = {'min': 0, 'max': 1000, 'palette': [
'3ae237', 'b5e22e', 'd6e21f', 'fff705', 'ffd611', 'ffb613', 'ff8b13',
'ff6e08', 'ff500d', 'ff0000', 'de0101', 'c21301', '0602ff', '235cb1',
'307ef3', '269db1', '30c8e2', '32d3ef', '3be285', '3ff38f', '86e26f'
]}
map_plot_c.addLayer(elevation, terrain_params, 'elevation')
map_plot_c
In order to obtain the elevation, slope, and aspect for all plots of interest for use in our model, we can use ee.Image.reduceRegions() to perform the same calculation of the mean for multiple areas. We'll use our ee_plots_gdf from the Sentinel-2 calculations for this purpose. While we could calculate slope, aspect, and elevation individually with reduceRegions(), as we just did above, it will be more efficient and more reproducible to use a for loop that calculates the means for each of our terrain measures and joins them together into a single DataFrame.
# define measures of interest
measure_images = {'elevation': elevation, 'slope': slope, 'aspect': aspect}
# create empty dataframe to store results
srtm_df = pd.DataFrame()
# for each measure, calculate the mean for all plots
for label, img in measure_images.items():
    means = img.reduceRegions(
        reducer = ee.Reducer.mean(),
        collection = ee_plots_gdf,
        scale = 30
    )
    # convert to DataFrame
    means_df = geemap.ee_to_pandas(means)
    # select only the mean and plot id to join
    # to the original; rename 'mean' for clarity
    means_df = means_df.loc[:, ('unique_id', 'mean')].rename(columns={'mean': label})
    # if first terrain measure, define srtm_df as means_df
    if srtm_df.size == 0:
        srtm_df = means_df.copy()
    # else, join new terrain measures onto srtm_df DataFrame
    else:
        srtm_df = srtm_df.merge(means_df, on='unique_id')
srtm_df
| | unique_id | elevation | slope | aspect |
|---|---|---|---|---|
| 0 | -6.645496e+18 | 522.309789 | 2.920060 | 98.295870 |
| 1 | -8.821405e+18 | 1303.766563 | 11.103849 | 80.482079 |
| 2 | 2.659752e+18 | 1050.255761 | 2.422843 | 131.906412 |
We can see that our final, updated DataFrame now has values for mean elevation, slope, and aspect for each of our plots. These values will be included in our model as useful information to help identify which areas are growing maize.
As mentioned above, the source for climate data used by Azzari et al. 2021, aWhere, no longer exists. It is likely model performance will not be significantly impacted by a lack of weather data, so one option for proceeding is to leave weather information out of our dataset and model.
Alternatively, if local sources of climate information exist for your location of interest, those may be used in place of the aWhere values. Overall, the weather fields used by Azzari et al. were:
Identifying some or all of these values to include in your model may be useful, but is not necessary to continue.
For this demonstration, we will utilize CHIRPS for rainfall values (in mm). These are available for direct download or through Google Earth Engine; here, we'll use Google Earth Engine to obtain precipitation values for the center points of each of our plots. First, let's visualize the rainfall we'll be using:
# this is not a real plot from any survey data; it was randomly selected for demonstration
center_lat = -15.783006
center_lon = 35.437013
precip_point_g = geemap.foliumap.Map(center=[center_lat,center_lon],zoom=7)
precip_point_g.add_basemap('SATELLITE')
# select precipitation for the range of dates in the growing season
precip = ee.ImageCollection('UCSB-CHG/CHIRPS/DAILY').\
filter(ee.Filter.date('2018-11-20', '2019-04-20')).select('precipitation')
precip_params = {'min': 1.0, 'max': 17.0,\
'palette': ['001137', '0aab1e', 'e7eb05', 'ff4a2d', 'e90000'],}
precip_point_g.addLayer(precip, precip_params, 'Precipitation');
precip_point_g
Now, we'll get the total precipitation in mm over the course of the growing season for each of our plots:
# get the sum of all days (the first .reduce(ee.Reducer.sum()) which takes the sum of all Images in the ImageCollection),
# then get the mean for each plot area with .reduceRegions()
tot_precip = precip.reduce(ee.Reducer.sum()).reduceRegions(
    reducer = ee.Reducer.mean(),
    collection = ee_plots_gdf,
    scale = 30
)
# convert to DataFrame
weather_df = geemap.ee_to_pandas(tot_precip)
weather_df.rename(columns={'mean':'precipitation'},inplace=True)
weather_df
| | precipitation | unique_id |
|---|---|---|
| 0 | 1091.281834 | -6.645496e+18 |
| 1 | 1424.429921 | -8.821405e+18 |
| 2 | 1277.654234 | 2.659752e+18 |
We now have the precipitation for each plot in this weather_df table. Along with the survey data, Sentinel-2 values, and SRTM terrain measures, we now have all the features we'll use to build out our model for crop type identification.
In order to utilize all our information in a random forest model, we'll need to combine all the information corresponding to each plot in the same table. Python makes it easy to use Pandas DataFrames in machine learning models, so we will combine each individual DataFrame or GeoDataFrame we created above (the selected survey information, the Sentinel-2 values, the Shuttle Radar Topography Mission data, and the weather data) into the same table, joined by plot. Pandas provides a few options for joining tables, but for this purpose we can use pd.merge(), which performs a database-style join according to specified fields. For more information on merge, see the Pandas documentation, and for information on the ways to combine tables in Pandas, see this user guide. We'll need to perform three pd.merge() commands to combine all 4 tables.
# since all other tables have unique_id as a float dtype, convert survey_gdf's unique_id to float to match
survey_gdf['unique_id'] = survey_gdf['unique_id'].astype('float64')
merged = survey_gdf.merge(weather_df, on = 'unique_id')
merged = merged.merge(s2_2a_df, on = 'unique_id')
merged = merged.merge(srtm_df, on = 'unique_id')
merged
| | crop_code_a | crop_code_b | crop_code_c | crop_code_d | crop_code_e | plot | unique_id | geometry | plot_area | maize_pos | ... | SNDVI_t | SNDVI_constant | SNDVI_variance | SNDVI_count | SNDVI_mean | SNDVI_rmse | SNDVI_r2 | elevation | slope | aspect |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 34 | 12.0 | NaN | NaN | plot_a | -6.645496e+18 | POLYGON ((34.55748 -14.30543, 34.55665 -14.305... | 1.508917 | 1 | ... | 179.517173 | -216.949060 | 0.005259 | 8.246671 | 0.507617 | 0.012290 | 0.957053 | 522.309789 | 2.920060 | 98.295870 |
| 1 | 12 | 38 | NaN | NaN | 28.0 | plot_b | -8.821405e+18 | POLYGON ((33.93234 -11.56669, 33.93147 -11.567... | 1.641266 | 0 | ... | 670.055357 | -479.777723 | 0.003972 | 11.478678 | 0.380890 | 0.007652 | 0.886615 | 1303.766563 | 11.103849 | 80.482079 |
| 2 | 1 | 28 | 38.0 | 42.0 | NaN | plot_c | 2.659752e+18 | POLYGON ((33.34090 -13.60151, 33.33913 -13.602... | 13.306660 | 1 | ... | 12.999024 | -16.113102 | 0.006167 | 34.418301 | 0.428591 | 0.057037 | 0.404706 | 1050.255761 | 2.422843 | 131.906412 |
3 rows × 80 columns
Azzari et al. find that excluding plot observations below a minimum plot-size threshold harms model performance, so we do not apply any minimum threshold for plot size.
Azzari et al. consider a variety of sample sizes in their tests to determine the minimum number of plots needed for this model to perform accurately. While the sample size will depend on the survey data you have available, they find that approximately 3,000 - 4,000 plots are necessary for best performance when using full plot boundaries, plot corner points, or plot centroids. If you do have more plot data available, using all of your available plots will likely make your model stronger.
As mentioned above, Azzari et al. also perform feature pre-selection as the first step of their classification pipeline in order to prevent overfitting caused by the high number of features available in the data set. They utilize Mutual Information score as the criterion for selecting or removing features.
While they perform pre-selection on each data set they test, we will only be using the method they found to perform best, so we can pre-select our features according to that method's findings. To perform your own feature pre-selection, which may produce different results depending on the location and time period under consideration, you can utilize the same formula to determine the features to remove from your dataset. Following their selections, however, we will filter our merged dataset:
merged_preselected = merged.loc[:, ('maize_pos', 'district', 'plot', 'aspect', 'elevation', 'slope',
    'GCVI_cos2', 'GCVI_r2', 'GCVI_rmse', 'GCVI_variance', 'NBR1_cos1', 'NBR1_cos2', 'NBR1_r2',
    'NBR1_rmse', 'NBR1_sin2', 'NDTI_constant', 'NDTI_cos1', 'NDTI_cos2', 'NDTI_r2', 'NDTI_rmse',
    'NDTI_sin2', 'NDTI_variance', 'NDVI_cos1', 'NDVI_cos2', 'NDVI_mean', 'NDVI_sin1', 'NDVI_sin2',
    'RDED4_constant', 'RDED4_cos1', 'RDED4_r2', 'RDED4_rmse', 'RDED4_sin1', 'RDED4_sin2', 'RDED4_t',
    'SNDVI_r2', 'SNDVI_rmse', 'SNDVI_variance')]
Once we have the complete feature set for use in our model, we will prepare the data for model training by splitting it into train and test subsets. Azzari et al. stratify the data by district and crop type - utilizing the crop of interest, maize, as one crop type category and all other crops as the second category - prior to splitting, in order to ensure all subsets have the same balance of plots. We can utilize scikit-learn's function sklearn.model_selection.train_test_split() to split our data. train_test_split() takes an array or matrix, as in our case, as input and splits the data into specified sizes, according to any given stratification, to produce training and testing datasets for both the X and y (independent and dependent) features. For more information on train_test_split(), see the documentation here. We can split our data to obtain train and test subsets as 80 and 20 percent of the total, respectively. The code will be structured as follows:
y = merged_preselected['maize_pos']
# select all columns between aspect and SNDVI_variance
X = merged_preselected.loc[:, 'aspect':'SNDVI_variance']
To create the split, we'll assign 20 percent of the data to the test set (test_size=0.20), stratifying by maize presence and district. Note that train_test_split needs to be imported first:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=100,
    stratify=merged_preselected.loc[:, ('maize_pos', 'district')]
)
This will leave us with 4 different data chunks: train and test for our X features (X_train, X_test) and train and test for our feature of interest, maize_pos (y_train, y_test).
While the process we use below to train our model will not actually utilize the test dataset, this can be used if you want to refine your model to your particular dataset. For example, you can use the test set to perform your own feature pre-selection rather than using Azzari et al.'s selected features. The test set can also be used to plot validation curves or learning curves to evaluate whether your model is over or underfitting. For a brief introduction to these curves, see scikit-learn's Validation curves guide.
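As a brief illustration of that diagnostic, here's a minimal sketch of computing validation curve scores for a single hyperparameter, assuming the X_train and y_train subsets created above; the max_depth values tested are illustrative:
from sklearn.model_selection import validation_curve
from sklearn.ensemble import RandomForestClassifier
import numpy as np
depths = [1, 3, 5, 7]
# train and validate across cross-validation folds for each candidate max_depth
train_scores, val_scores = validation_curve(
    RandomForestClassifier(n_estimators=100, random_state=10),
    X_train, y_train,
    param_name='max_depth', param_range=depths,
    cv=5, scoring='accuracy'
)
# a large gap between mean train and validation accuracy suggests overfitting
print(np.mean(train_scores, axis=1), np.mean(val_scores, axis=1))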
We now can utilize the training dataset to tune our hyperparameters with a grid search and cross-validation. Hyperparameters are parameters of a machine learning model which do not come from the data but instead are set prior to training the final model. For example, a random forest model can use different criteria (such as Gini impurity or entropy) to measure the quality of each split in its decision trees, and the criterion used is a hyperparameter set prior to model training.
A grid search is a process which is performed prior to the final model training in order to determine which hyperparameters provide the best model performance. In a grid search, multiple combinations of hyperparameters are tested to determine the best combination for performance.
We'll also use cross-validation to make this testing more comprehensive. Cross-validation essentially trains and tests multiple models on different subsets, or folds, of the training data to arrive at a more robust estimate of model performance. For example, if our training dataset was an (unrealistically small) 100 records and we set our number of 'folds' to be k = 5, we would have 5 folds of 20 records. In the first cross-validation step, the first fold of 20 will be left out while the other four folds are used to train a model. Then, the model which is trained will be used on the first fold and the performance assessed. Next, the second fold will be left out and the other four folds (the first, third, fourth, and fifth) will be used to train the model again, with the performance assessed on the held-out second fold. This process will continue until all five folds have been used to evaluate a model, and the combined performance of all five models will be used as our performance estimate.
Scikit-learn has a function to perform this grid search with cross-validation, GridSearchCV(). We'll use sklearn.ensemble.RandomForestClassifier() with sklearn.pipeline.Pipeline() and sklearn.model_selection.GridSearchCV() to tune our hyperparameters.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
rf_pipeline = Pipeline([
    ('rf', RandomForestClassifier())
])
rf_params = {
    'rf__n_estimators': (100, 1000, 5000),
    'rf__criterion': ('gini', 'entropy'),
    'rf__max_depth': (1, 3, 5),
    'rf__min_samples_split': (2, 5, 10),
    'rf__random_state': [10]
}
k = 10
rf_grid_model = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_params,
    cv=k,
    scoring=['accuracy', 'precision', 'recall'],
    refit='accuracy'
)
crop_fitted = rf_grid_model.fit(X_train, y_train.ravel())
crop_results = pd.DataFrame(crop_fitted.cv_results_)
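To see which combination performed best, you can read it directly off the fitted grid search object; best_params_ and best_score_ are standard GridSearchCV attributes:
# best hyperparameter combination and its mean cross-validated accuracy;
# parameter names carry the 'rf__' prefix from the pipeline step name
print(crop_fitted.best_params_)
print('best mean CV accuracy:', crop_fitted.best_score_)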
Once you've identified the hyperparameters which produce the best-performing model using GridSearchCV(), you can use these hyperparameters in a final, optimized Random Forest model trained on your entire training set. For example, if the top-performing model identified in your grid search used 1000 estimators (n_estimators), entropy as the criterion, a max_depth of 3, and a min_samples_split of 5, your final model would look something like this:
rf_best = RandomForestClassifier(criterion = 'entropy', max_depth = 3, min_samples_split = 5, n_estimators = 1000, random_state=10)
rf_best_fit = rf_best.fit(X_train, y_train.ravel())
Now that you have a trained Random Forest model (rf_best_fit), you'll want to apply it to your entire area of interest at the pixel level. Python doesn't perform as well with this quantity of satellite imagery (as witnessed by how long it took to obtain a single pixel's GCVI values in the optional time series section above). Instead, it's recommended to use Google Earth Engine's Code Editor to export an Image to Google Drive or Google Cloud Storage containing bands for our features of interest (that is: Sentinel-2 coefficients, terrain, and weather). This can be accomplished using some of the tools provided in this repo.
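As a rough sketch of what such an export might look like using the Python API instead of the Code Editor, assuming a hypothetical feature_image (an ee.Image with one band per model feature) and aoi_geometry (an ee.Geometry covering your area of interest):
# export the feature Image to Google Drive as a GeoTIFF for local prediction
task = ee.batch.Export.image.toDrive(
    image=feature_image,              # assumed: one band per model feature
    description='pixel_features_export',
    folder='crop_mapping',            # assumed Drive folder name
    region=aoi_geometry,              # assumed area of interest
    scale=30,
    maxPixels=1e13
)
task.start()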
Once this data is exported to your drive, you can import it (for this example, say we've imported the data as full_pixel_features) for use with your trained model. Note that the exports from GEE will be in the form of .tif files. You can use the package gdal or rasterio to transform them into numpy arrays, which can then be fed to our trained model. You will need to use gdal or rasterio again after making predictions with scikit-learn, to transform the predicted arrays back into rasters. For an example of converting rasters into numpy arrays using rasterio, see below.
To use our trained model to classify the pixel-level data we exported from GEE, I'm using predict_proba(), which provides the predicted probability of being maize for each pixel. There is also the option of using predict() to get a yes or no classification of maize, but obtaining the predicted probabilities for each pixel will allow more flexibility in classifying according to your needs.
pred_pixels = rf_best_fit.predict_proba(full_pixel_features)
While we are unable to share the full data here, from these results you'll have obtained probability rasters - i.e., probabilities of maize cultivation at the pixel level.
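To make that round trip concrete, here's a minimal sketch, assuming a hypothetical exported GeoTIFF named full_pixel_features.tif whose bands match the model's feature columns in order (nodata handling is omitted for brevity):
import numpy as np
import rasterio
# read the exported feature stack: shape (bands, rows, cols)
with rasterio.open('full_pixel_features.tif') as src:  # assumed file name
    stack = src.read()
    profile = src.profile
bands, rows, cols = stack.shape
# reshape to (pixels, features) for scikit-learn
full_pixel_features = stack.reshape(bands, rows * cols).T
# probability of the positive (maize) class for each pixel;
# column order follows rf_best_fit.classes_, here assumed to be [0, 1]
pred = rf_best_fit.predict_proba(full_pixel_features)[:, 1]
# reshape predictions back into the raster grid and write a probability raster
prob_raster = pred.reshape(rows, cols).astype('float32')
profile.update(count=1, dtype='float32')
with rasterio.open('maize_probability.tif', 'w', **profile) as dst:  # assumed output name
    dst.write(prob_raster, 1)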
We'll use a sample of the probability rasters created by Atlas AI to demonstrate the final step of using your probability rasters to generate masks, or maps, of pixels with your crop's cultivation. We'll continue using the example of Malawi, and we'll demonstrate this process using a cropland probability raster and maizeland probability raster both centered around our sample plot c. The cropland raster contains the predicted values for whether land is cultivated by any crops, while the maizeland raster contains the predictions for whether land is cultivated by maize. Atlas AI use both to create their final maps of maize locations: first, they filter their prospective areas to only those likely to be cultivated by any crop, according to the cropland raster, then they further demarcate these areas by their likelihood to contain maize, according to the maizeland raster.
To work with this raster data, I'll be using two additional Python packages: rasterio and earthpy. Rasterio is a package for accessing raster files and can be installed either with pip according to the documentation or with conda-forge according to the instructions. Earthpy is a Python package for working with spatial and remote sensing data, and it requires rasterio as well as geopandas and numpy to function. You can install it either with conda, according to the conda instructions, or with pip according to the documentation. For more information on working with raster data in Python, including some of the techniques used here, see the Earth Lab's Intermediate Earth Data Science Textbook, Section 3, Chapter 4.
The cropland probability raster we'll be using provides a probability, scaled from 0 to 100, of crop cultivation being present at a given pixel. The maizeland probability raster is in the same format, but it provides the probability of maize being present. Let's first open these .tif files using rasterio.open() to begin working with them.
import rasterio
from rasterio.plot import show, plotting_extent
import earthpy as et
import earthpy.plot as ep
cropland_file = './atlasai_crop_maize_raster_2019_sample/cropland_2019.tif'
maizeland_file = './atlasai_crop_maize_raster_2019_sample/maizeland_2019.tif'
cropland_raster = rasterio.open(cropland_file)
maizeland_raster = rasterio.open(maizeland_file)
We now have access to the contents of these files, but we don't have the actual data yet. To obtain the values of each pixel represented as a numpy array, we can use .read():
maizeland_raster.read()
array([[[69, 72, 53, ..., 85, 85, 79], [63, 57, 52, ..., 85, 83, 83], [70, 57, 49, ..., 73, 85, 86], ..., [71, 73, 68, ..., 72, 76, 80], [74, 74, 64, ..., 68, 78, 85], [75, 72, 65, ..., 76, 76, 80]]], dtype=uint8)
# summary of the size of maizeland_raster contents
maizeland_raster.read().shape
(1, 2004, 2049)
.read() takes in the entire file and provides the contents in a 3-dimensional array whose 3 axes are (bands, rows, columns). We can see from the results of .shape that the sample maizeland raster produced by our model has only one band: the first number in the shape tuple. This band holds the probability of maize being present at each pixel. Since we only have one band, we can simplify the final array that we create from the raster by reading only the 1st band to produce a 2-dimensional array:
maizeland_array = maizeland_raster.read(1)
maizeland_array
array([[69, 72, 53, ..., 85, 85, 79], [63, 57, 52, ..., 85, 83, 83], [70, 57, 49, ..., 73, 85, 86], ..., [71, 73, 68, ..., 72, 76, 80], [74, 74, 64, ..., 68, 78, 85], [75, 72, 65, ..., 76, 76, 80]], dtype=uint8)
f, (ax1,ax2) = plt.subplots(1,2,figsize=(15,8))
ep.plot_bands(cropland_raster.read(1),cmap='Greens',ax=ax1)
ax1.set_title('Cropland Raster')
ep.plot_bands(maizeland_raster.read(1),cmap='Wistia',ax=ax2)
ax2.set_title('Maizeland Raster')
We can see that each raster provides a range of values from 0 to close to 100, with each pixel representing a different 20m x 20m area. These are the values which Azzari et al. utilize to create their final predictions for maize.
First, they utilize the cropland raster and define any pixel with a probability 40 or greater to be cropland. They remove all pixels with probability less than 40 from this cropland raster to arrive at a representation of all likely cropland pixels.
This map of likely crop areas is then used to filter the maizeland raster to only those areas which are cropland. The remaining maizeland pixels are then assigned either as maize or not maize depending on their probability score, with a threshold of 60 percent as the cutoff for the presence of maize.
I'll demonstrate this process below using earthpy.mask.mask_pixels() to remove the pixels we identify as not crops or not maize.
import earthpy.mask as em
f, (ax1,ax2) = plt.subplots(1,2,figsize=(15,8))
# create separate array of true/false values for pixels above/below 40 percent threshold
cropland_mask = cropland_raster.read(1) < 40
# use true/false array to remove all pixels which were below 40 percent
cropland_masked = em.mask_pixels(cropland_raster.read(1), cropland_mask)
# remove all pixels from maizeland raster which do not contain crops
maizeland_crop_masked = em.mask_pixels(maizeland_raster.read(1), cropland_mask)
# create array of remaining maizeland pixels as either above/below the 60 percent threshold
maizeland_mask = maizeland_crop_masked < 60
# remove all maizeland pixels which were below 60 percent
maizeland_masked = em.mask_pixels(maizeland_crop_masked, maizeland_mask)
# plot final rasters identifying cropland and maizeland predicted from models
ep.plot_bands(cropland_masked,cmap='Greens',ax=ax1,scale=True,vmin=0,vmax=100)
ep.plot_bands(maizeland_masked,cmap='Wistia',ax=ax2,scale=True,vmin=0,vmax=100)
Our final product will thus be maizeland_masked, which was created by first removing all pixels not identified as cropland (note that we're able to use cropland_mask on our maizeland raster only because both rasters cover the same area; if not, the pixels would not align and our masking would be incorrect). Then, we created an additional mask of all maizeland pixels below a 60 percent probability of being maize, and we filtered these pixels out of our final maizeland_masked product. The final product showing predicted areas of crops can subsequently be used for any research or policy analyses which require accurate information on the location of crop growth.
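If you'd like to keep this final mask as its own file for use in a GIS or later analyses, here's a brief sketch of writing maizeland_masked back out as a GeoTIFF, reusing the opened maizeland_raster's metadata; the output file name and nodata value are just examples:
# copy the source raster's metadata and write the masked array out as uint8
out_profile = maizeland_raster.profile.copy()
out_profile.update(count=1, dtype='uint8', nodata=255)
with rasterio.open('maizeland_masked_2019.tif', 'w', **out_profile) as dst:
    # fill masked (non-maize) pixels with the nodata value before writing
    dst.write(maizeland_masked.filled(255).astype('uint8'), 1)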
This process, tested and refined by Atlas AI, helps enable policymakers in resource-constrained settings to incorporate agricultural evidence as they implement improvements. With agricultural growth a primary development goal, this information can be invaluable. While generating predictions for crop locations from a machine learning model will necessarily involve some inaccuracies compared to ground-truth survey data, the Atlas AI team has demonstrated strong performance from a model developed according to their specifications. In areas where very little ground-truth information exists, the high-quality predictions this process can generate are a unique resource for enabling evidence-driven decision making.
“About | 50 by 2030.” 2019. https://www.50x2030.org/about.
Azzari, George, Shruti Jain, Graham Jeffries, Talip Kilic, and Siobhan Murray. 2021. “Understanding the Requirements for Surveys to Support Satellite-Based Crop Type Mapping: Evidence from Sub-Saharan Africa.” Remote Sensing 13, no. 23: 4749. https://doi.org/10.3390/rs13234749.
Deines, Jillian M., Rinkal Patel, Sang-Zi Liang, Walter Dado, and David B. Lobell. 2021. “A Million Kernels of Truth: Insights into Scalable Satellite Maize Yield Mapping and Yield Gap Analysis from an Extensive Ground Dataset in the US Corn Belt.” Remote Sensing of Environment 253: 112174. https://doi.org/10.1016/j.rse.2020.112174.
Earth Resources Observation And Science (EROS) Center. 2017. "Sentinel." U.S. Geological Survey. https://doi.org/10.5066/F76W992G.
“Food Sustainability Index.” 2021. Economist Impact and Barilla Centre for Food and Nutrition Foundation. https://impact.economist.com/projects/foodsustainability/.
Funk, C.C., Peterson, P.J., Landsfeld, M.F., Pedreros, D.H., Verdin, J.P., Rowland, J.D., Romero, B.E., Husak, G.J., Michaelsen, J.C., and Verdin, A.P.. 2014. A quasi-global precipitation time series for drought monitoring: U.S. Geological Survey Data Series 832, 4 p. http://pubs.usgs.gov/ds/832/.
Funk, Chris, Pete Peterson, Seth Peterson, Shraddhanand Shukla, Frank Davenport, Joel Michaelsen, Kenneth R. Knapp, Martin Landsfeld, Gregory Husak, Laura Harrison, James Rowland, Michael Budde, Alex Meiburg, Tufa Dinku, Diego Pedreros, and Nicholas Mata. 2019. A high-resolution 1983–2016 Tmax climate data record based on infrared temperatures and stations by the Climate Hazard Center. Journal of Climate 32, 5639-5658. https://doi.org/10.1175/JCLI-D-18-0698.1.
Jin, Zhenong, George Azzari, Calum You, Stefania Di Tommaso, Stephen Aston, Marshall Burke, and David B. Lobell. 2019. “Smallholder Maize Area and Yield Mapping at National Scales with Google Earth Engine.” Remote Sensing of Environment 228: 115–28. https://doi.org/10.1016/j.rse.2019.04.016.
“Missions - Sentinel Online - Sentinel Online.” n.d. Accessed July 5, 2021. https://sentinel.esa.int/web/sentinel/missions.
“Sentinel-2 - Satellite Description - Sentinel Online - Sentinel Online.” n.d. Accessed September 2, 2021. https://sentinel.esa.int/web/sentinel/missions/sentinel-2/satellite-description.
“User Guides - Sentinel-2 MSI - Resolutions - Sentinel Online - Sentinel Online.” n.d. Accessed September 2, 2021. https://sentinel.esa.int/web/sentinel/user-guides/sentinel-2-msi/resolutions.
“Sentinel-2 - Missions - Resolution and Swath - Sentinel Handbook - Sentinel Online.” n.d. Accessed September 2, 2021. https://sentinel.esa.int/web/sentinel/missions/sentinel-2/instrument-payload/resolution-and-swath.