I found this site and this site both really helpful for actually implementing the Canny Edge Detetor. Definitely refer to both of them for more examples and pictures.
I did use Python/NumPy for generating the Gaussian Filter kernel files that were needed for the ArcGIS Focal Statistics function. The following code will generate a properly formatted kernel file based on the kernel size and sigma value.
import os
import numpy as np
def gaussian_kernel(workspace, size=5, sigma=1.4):
    sigma_str = str(round(sigma,1)).replace('.', 'p')
    file_name = 'gaussian_{0}_{1}.txt'.format(size, sigma_str)
    size = int(size)
    i,j = np.mgrid[0:size,0:size]
    i -= (size/2)
    j -= (size/2)
    g = np.exp(-(i**2+j**2)/(2*sigma**2))
    g /= g.sum()
    
    f = open(os.path.join(workspace, file_name), 'wb')
    f.write('{0} {0}\n'.format(size))
    for row in g:
        print '  {0}'.format(' '.join(['{0:0.6f}'.format(x) for x in row]))
        f.write('{0}\n'.format(' '.join(['{0:0.6f}'.format(x) for x in row])))
    f.close()
I then had to build kernel files for the Sobel Operators.| sobel_x.txt | sobel_y.txt | 
|---|---|
3 3 | 
3 3 | 
| d0.txt | d1.txt | d2.txt | d3.txt | 
|---|---|---|---|
3 3 | 
3 3 | 
3 3 | 
3 3 | 
temp_obj.save(temp_raster) calls.  I didn't include all of my input checking and some names/values are hardwired.import os
import arcpy
from arcpy import env
import arcpy.sa as sa
import numpy as np
def canny_edge(workspace, raster_name, filter_name, t_low, t_high):
    #### Set ArcGIS environment parameters
    arcpy.CheckOutExtension("Spatial")
    env.overwriteOutput = True
    env.workspace = workspace
    env.scratchWorkspace = workspace
    env.cellSize = raster_path
    env.extent = raster_path
    env.snapRaster = raster_path
    #### Hardwired file names for kernels
    sobel_x_path = os.path.join(workspace, 'sobel_x.txt')
    sobel_y_path = os.path.join(workspace, 'sobel_y.txt')
    d0_path = os.path.join(workspace, 'd0.txt')
    d1_path = os.path.join(workspace, 'd1.txt')
    d2_path = os.path.join(workspace, 'd2.txt')
    d3_path = os.path.join(workspace, 'd3.txt')
    d4_path = os.path.join(workspace, 'd4.txt')
    d5_path = os.path.join(workspace, 'd5.txt')
    d6_path = os.path.join(workspace, 'd6.txt')
    d7_path = os.path.join(workspace, 'd7.txt')
    #### Canny Edge Detector Workflow
    #### Apply gaussian filter
    filter_obj = sa.FocalStatistics(
        raster_path, sa.NbrWeight(filter_path), "SUM", "NODATA")
    #### Calculate 1st derivative in horizontal and vertical direction
    grad_x_obj = sa.FocalStatistics(
        filter_obj, sa.NbrWeight(sobel_x_path), "SUM", "DATA")
    grad_y_obj = sa.FocalStatistics(
        filter_obj, sa.NbrWeight(sobel_y_path), "SUM", "DATA")
    del filter_obj
    #### Calculate gradient and direction
    grad_obj = sa.SquareRoot(
        sa.Power(grad_x_obj,2) + sa.Power(grad_y_obj,2))
    angle_obj = sa.ATan2(grad_y_obj, grad_x_obj)
    del grad_x_obj, grad_y_obj
    #### Reclassify angle into four directions
    #### See http://suraj.lums.edu.pk/~cs436a02/CannyImplementation.htm
    reclass_remap = sa.RemapRange(
        [[-3.141593, -2.748894, 0], [-2.748894, -1.963495, 1],
         [-1.963495, -1.178097, 2], [-1.178097, -0.392699, 3],
         [-0.392699,  0.392699, 0], [ 0.392699,  1.178097, 1],
         [ 1.178097,  1.963495, 2], [ 1.963495,  2.748894, 3],
         [ 2.748894,  3.141593, 0]])
    zone_obj = sa.Reclassify(
        angle_obj, "Value", reclass_remap, "NODATA")
    del angle_obj, reclass_remap
    #### Non-maxima suppresion
    #### Shift raster 1 cell in each of the 8 possible directions
    #### This will get neighboring cells for non-maxima suppression
    #### Keep cells that are greater than the two neighboring cells
    ####  along the gradient direction
    #### Calculate separate raster for each primary direction then sum
    d0_obj = sa.FocalStatistics(
        grad_obj, sa.NbrWeight(d0_path), "SUM", "DATA")
    d4_obj = sa.FocalStatistics(
        grad_obj, sa.NbrWeight(d4_path), "SUM", "DATA")
    grad_04_obj = sa.Con(
        (zone_obj == 0) & (grad_obj > d0_obj) & (grad_obj > d4_obj),
        grad_obj, 0)
    del d0_obj, d4_obj
    d1_obj = sa.FocalStatistics(
        grad_obj, sa.NbrWeight(d1_path), "SUM", "DATA")
    d5_obj = sa.FocalStatistics(
        grad_obj, sa.NbrWeight(d5_path), "SUM", "DATA")
    grad_15_obj = sa.Con(
        (zone_obj == 1) & (grad_obj > d1_obj) & (grad_obj > d5_obj),
        grad_obj, 0)
    del d1_obj, d5_obj
    d2_obj = sa.FocalStatistics(
        grad_obj, sa.NbrWeight(d2_path), "SUM", "DATA")
    d6_obj = sa.FocalStatistics(
        grad_obj, sa.NbrWeight(d6_path), "SUM", "DATA")
    grad_26_obj = sa.Con(
        (zone_obj == 2) & (grad_obj > d2_obj) & (grad_obj > d6_obj),
        grad_obj, 0)
    del d2_obj, d6_obj
    d3_obj = sa.FocalStatistics(
        grad_obj, sa.NbrWeight(d3_path), "SUM", "DATA")
    d7_obj = sa.FocalStatistics(
        grad_obj, sa.NbrWeight(d7_path), "SUM", "DATA")
    grad_37_obj = sa.Con(
        (zone_obj == 3) & (grad_obj > d3_obj) & (grad_obj > d7_obj),
        grad_obj, 0)
    del d3_obj, d7_obj
    del grad_obj, zone_obj
    print "Sum Directions"
    nms_obj = (grad_04_obj + grad_15_obj + grad_26_obj + grad_37_obj)
    del grad_04_obj, grad_15_obj, grad_26_obj, grad_37_obj
    nms_obj.save(raster_path.replace('.img', '_nms.img'))
    
    #### Hysteresis Thresholding
    t_low_obj = sa.RegionGroup(
        sa.Con(nms_obj > t_low, 1), "EIGHT", "CROSS", "NO_LINK")
    t_zs_obj = sa.ZonalStatistics(
        t_low_obj, "value", nms_obj, "MAXIMUM", "DATA")
    del t_low_obj
    t_high_obj = sa.Con((t_zs_obj >= t_high), nms_obj)
    del t_zs_obj, nms_obj
    t_high_obj.save(raster_path.replace('.img', '_edges.img'))
There are probably issues at the edges of the rasters because I noticed that the Focal Statistics was not properly handling nodata using the weighting option.