This notebook shows you how to use spatial queries in Spark environments. The notebook uses the spatio-temporal library that is pre-installed on all Spark environments in Watson Studio. You will learn how to perform common spatial queries in Spark.
The types of spatial queries you will learn to use are:
Often, a spatial function has one parameter that refers to a spatial column in one table and a second parameter that refers to a spatial constant or to a spatial column in another table. This notebook shows you how to use functions to access and combine data of different types to perform spatial queries.
This notebook runs on Python and Spark.
spark._jvm.org.apache.spark.sql.types.SqlGeometry.registerAll(spark._jsparkSession)
This notebook uses a sample data set that is available in the IBM Watson Studio Gallery. Direct links are used by default to make sure this notebook is publicly runnable.
When working with your own data, use your preferred way of loading it into a Spark data frame, depending on where your data source resides.
Here are some hints if you are using IBM Cloud Object Storage:
- Click the Code snippets button in the menu bar, find your data set, and then, under Insert as, select SparkSession DataFrame. Code that adds a Spark data frame is generated automatically.
- Use ibmos2spark to read the data into a Spark data frame.
Read the hospital data where each hospital's location is a latitude-longitude point:
import pandas as pd
from pyspark.sql.types import *
from urllib.request import Request, urlopen
req = Request('https://api.dataplatform.cloud.ibm.com/v2/gallery-assets/entries/5562ced564e776edc5f91e13d48d8309/data?accessKey=466875ad0187d4ea757478e5c1130b59')
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')
content = urlopen(req)
hospital_pdf = pd.read_csv(content)
print(hospital_pdf)
If the code above fails because the link cannot be found, download the hospitals.csv data set from the Watson Studio gallery and insert it manually using the method described above.
hospital_schema = StructType([StructField('id', IntegerType()),
StructField('name', StringType()),
StructField('city', StringType()),
StructField('state', StringType()),
StructField('lon', DoubleType()),
StructField('lat', DoubleType())])
hospital_df = spark.createDataFrame(hospital_pdf, hospital_schema)
hospital_df.show(3)
Read the county data where each county is a polygon/multipolygon:
from urllib.request import Request, urlopen # Python 3
req = Request('https://api.dataplatform.cloud.ibm.com/v2/gallery-assets/entries/c8cc28f4c30dc4d8c0b13f18c50c3244/data?accessKey=c8cc28f4c30dc4d8c0b13f18c50fa2d5')
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')
content = urlopen(req)
counties_pdf = pd.read_csv(content)[['NAME', 'STATE_NAME', 'POP2000', 'shape_WKT']]
print(counties_pdf)
counties_schema = StructType([StructField('NAME', StringType()),
StructField('STATE_NAME', StringType()),
StructField('POP2000', IntegerType()),
StructField('shape_WKT', StringType())])
counties_df = spark.createDataFrame(counties_pdf, counties_schema)
counties_df.show(3)
The raw spatial data in the data frame can come in various forms, for example, separate latitude and longitude columns, or a column containing the WKT string of the geometry.
Therefore, the first step is to use a spatial function to generate a new spatial column that combines the data in these columns.
For example, use the function:
- ST_Point(lon_col, lat_col) if the raw spatial data is in a latitude column and a longitude column
- ST_WKTToSQL(wkt_col) if the raw spatial data is in a column containing the WKT string form of the geometry
For the full list of possible query functions, see Geospatial Toolkit functions.
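As a plain-Python aside, the lon/lat ordering that ST_Point expects (x is the longitude, y is the latitude) can be made concrete by writing a point out in its WKT form. This sketch is for intuition only; it is not how the library represents geometries internally:

```python
# Illustration only: ST_Point(lon, lat) builds a point geometry whose WKT
# form looks like this. Note the order: x (longitude) first, y (latitude) second.
def to_wkt_point(lon, lat):
    """Format a longitude/latitude pair as a WKT POINT string."""
    return f"POINT ({lon} {lat})"

print(to_wkt_point(-77.574722, 43.146732))  # POINT (-77.574722 43.146732)
```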
Create a geometry column for the hospital data using ST_Point(lon, lat)
:
hospital_df.createOrReplaceTempView("hospitals")
hospital_df = spark.sql("SELECT *, ST_Point(lon, lat) as location from hospitals")
hospital_df.show(3, False)
Create a geometry column for the county data using ST_WKTToSQL(wkt_string)
:
counties_df.createOrReplaceTempView('counties')
counties_df = spark.sql("SELECT NAME, STATE_NAME, POP2000, ST_WKTToSQL(shape_WKT) as shape from counties")
counties_df.show(3)
A data frame can also be used to create a temporary view. Registering a data frame as a table allows you to run SQL queries over its data. Register the hospital and county data frames as a temporary view:
spark.conf.set("spark.sql.legacy.storeAnalyzedPlanForView","False")
print(spark.conf.get("spark.sql.legacy.storeAnalyzedPlanForView"))
hospital_df.createOrReplaceTempView('hospitals_temp')
counties_df.createOrReplaceTempView('counties_temp')
This sample query shows you how to find the hospitals that are within a certain distance of a given location (constructed using ST_Point).
spark.sql("""
SELECT name, city, state
FROM hospitals_temp
WHERE ST_Distance(location, ST_Point(-77.574722, 43.146732)) < 10000.0
""").show()
The following sample queries show you how to use spatial functions to determine which polygon contains a given point. The examples use the following functions:
- ST_Contains(geom1, geom2): returns TRUE if the geom2 values are completely contained by the polygons identified by geom1.
- ST_Within(geom1, geom2): returns TRUE if the geom1 values are within the polygons identified by geom2.
- ST_Intersects(geom1, geom2): returns TRUE if geom1 and geom2 intersect spatially in any way; they may touch, cross, or contain one another.
spark.sql("""
SELECT NAME
FROM counties_temp
WHERE ST_Contains(shape, ST_Point(-74.237, 42.037))
""").show()
spark.sql("""
SELECT NAME
FROM counties_temp
WHERE
ST_Within(ST_Point(-74.237, 42.037), shape)
""").show()
spark.sql("""
SELECT NAME
FROM counties_temp
WHERE
ST_Intersects(shape, ST_Point(-74.237, 42.037))
""").show()
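The containment predicates above can be understood through the classic ray-casting test: count how often a ray from the query point crosses the polygon boundary. The following is a minimal plain-Python sketch of that idea, not how the library implements ST_Contains (which works on geodetic coordinates and handles boundary cases this sketch ignores):

```python
def point_in_polygon(x, y, vertices):
    """Ray-casting test: cast a ray from (x, y) to the right and count
    how many polygon edges it crosses. An odd count means inside.
    `vertices` is a list of (x, y) tuples forming a closed ring.
    Planar approximation only -- illustration, not the library's algorithm."""
    inside = False
    n = len(vertices)
    for i in range(n):
        x1, y1 = vertices[i]
        x2, y2 = vertices[(i + 1) % n]
        # Does this edge straddle the horizontal line through y?
        if (y1 > y) != (y2 > y):
            # x-coordinate where the edge crosses that line
            x_cross = x1 + (y - y1) * (x2 - x1) / (y2 - y1)
            if x < x_cross:
                inside = not inside
    return inside

square = [(-74.0, 42.0), (-73.0, 42.0), (-73.0, 43.0), (-74.0, 43.0)]
print(point_in_polygon(-73.5, 42.5, square))  # True
print(point_in_polygon(-75.0, 42.5, square))  # False
```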
Each of the following queries determines which hospitals are located within the specified polygon, which is defined as a constant using the well-known text (WKT) representation. The polygon definition consists of the keyword POLYGON followed by a comma-separated list of vertices, where each vertex is a pair of $x$ and $y$ coordinates separated by a space. The entire list of coordinate pairs must be enclosed in parentheses.
spark.sql("""
SELECT name
FROM hospitals_temp
WHERE
ST_Contains(ST_WKTToSQL('POLYGON ((-74.0 42.0, -73.0 42.0, -73.0 43.0, -74.0 43.0, -74.0 42.0))'), location)
""").show(3)
spark.sql("""
SELECT name
FROM hospitals_temp
WHERE ST_Within(location, ST_WKTToSQL('POLYGON ((-74.0 42.0, -73.0 42.0, -73.0 43.0, -74.0 43.0, -74.0 42.0))'))
""").show(3)
spark.sql("""
SELECT name
FROM hospitals_temp
WHERE ST_Intersects(location, ST_WKTToSQL('POLYGON ((-74.0 42.0, -73.0 42.0, -73.0 43.0, -74.0 43.0, -74.0 42.0))'))
""").show(3)
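The WKT grammar described above is simple enough to unpack by hand. The following rough plain-Python parser handles only the single-ring POLYGON constants used in this notebook; real applications should use a proper WKT library or the ST_WKTToSQL function itself:

```python
def parse_simple_polygon(wkt):
    """Extract (x, y) vertex tuples from a single-ring WKT POLYGON.
    Handles only the simple 'POLYGON ((x y, x y, ...))' form used here."""
    coords = wkt.strip().removeprefix("POLYGON").strip().strip("()")
    return [tuple(float(v) for v in pair.split()) for pair in coords.split(",")]

ring = parse_simple_polygon(
    "POLYGON ((-74.0 42.0, -73.0 42.0, -73.0 43.0, -74.0 43.0, -74.0 42.0))")
print(ring[0])    # (-74.0, 42.0)
print(len(ring))  # 5 vertices; the first and last coincide to close the ring
```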
Just as a regular join function can join two tables based on the values in columns that contain character or numeric data, spatial join functions can be used to join tables based on the values in the columns that contain spatial data. The following examples use the counties and hospitals tables.
You can use a spatial join to find the hospitals located within a specific county. For example, the following query returns a list of all the hospitals in Dutchess County:
spark.sql("""
SELECT c.NAME, h.name
FROM counties_temp AS c, hospitals_temp AS h
WHERE c.NAME = 'Dutchess'
AND ST_Intersects(c.shape, h.location)
""").show()
Alternatively, you can use the SQL JOIN ... ON ...
notation, which is equivalent to a spatial predicate in the WHERE
clause. For example, the following query produces the same result set as the previous query:
spark.sql("""
SELECT h.name, c.NAME
FROM counties_temp AS c
JOIN hospitals_temp AS h
ON c.NAME = 'Dutchess'
AND ST_Intersects(h.location, c.shape)
""").show()
The following query returns the name of the county in which a particular hospital is located:
spark.sql("""
SELECT c.NAME, h.name
FROM hospitals_temp AS h, counties_temp AS c
WHERE ST_Intersects(h.location, c.shape)
AND h.name = 'Vassar Brothers Hospital'
""").show()
This section shows you how to use spatial joins in conjunction with additional predicates and aggregation to address business problems. These examples continue to use the hospitals and counties tables, but the same principles can be applied to any other type of data.
The following example queries the hospitals within each county in New York state, qualifying by the state name in the counties table.
spark.sql("""
SELECT c.NAME, h.name
FROM counties_temp AS c, hospitals_temp AS h
WHERE ST_Intersects(h.location, c.shape)
AND c.STATE_NAME='New York'
ORDER BY c.NAME, h.name
""").show(3)
The same results can be obtained by rewriting the above query and using the fields from the hospitals table:
spark.sql("""
SELECT c.NAME, h.name
FROM hospitals_temp AS h, counties_temp AS c
WHERE ST_Intersects(h.location, c.shape)
AND h.state='NY'
ORDER BY c.NAME, h.name
""").show(3)
The following example lists the number of hospitals per county in New York:
spark.sql("""
SELECT c.NAME, COUNT(h.name) AS hospital_count
FROM counties_temp AS c, hospitals_temp AS h
WHERE ST_Intersects(h.location, c.shape)
AND c.STATE_NAME='New York'
GROUP BY c.NAME
""").show(3)
To identify counties where the population is underserved by hospitals, an interesting metric might be the number of people per hospital in each county. Using the population of each county in the year 2000, you can calculate this number.
spark.sql("""
SELECT c.NAME,
COUNT(h.name) AS hospital_count,
c.POP2000 AS Population,
c.POP2000/COUNT(h.name) AS people_per_hospital
FROM counties_temp AS c, hospitals_temp AS h
WHERE c.STATE_NAME='New York'
AND ST_Intersects(h.location, c.shape)
GROUP BY c.NAME, c.POP2000
ORDER BY people_per_hospital DESC
""").show(3)
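The aggregation above amounts to counting hospitals per county and dividing the county population by that count. The same logic, sketched in plain Python on made-up sample rows (the county names, counts, and populations below are hypothetical, for illustration only):

```python
# Hypothetical (county, hospital) pairs, as produced by the spatial join:
joined = [
    ("Dutchess", "Hospital A"), ("Dutchess", "Hospital B"),
    ("Ulster", "Hospital C"),
]
pop2000 = {"Dutchess": 280150, "Ulster": 177749}  # hypothetical populations

# GROUP BY county, COUNT(hospital):
counts = {}
for county, _ in joined:
    counts[county] = counts.get(county, 0) + 1

# POP2000 / COUNT(hospital):
people_per_hospital = {c: pop2000[c] / n for c, n in counts.items()}
print(people_per_hospital)  # {'Dutchess': 140075.0, 'Ulster': 177749.0}
```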
With additional detail, such as the number of beds or the number of doctors per hospital, you could derive a better measure of health care coverage per state and population.
A common use case for mapping applications, and in particular for web mapping, is to select objects that fall within a specific rectangular region. This can be done by creating a polygon to represent the rectangle and using the ST_Intersects
spatial predicate.
spark.sql("""
SELECT name
FROM hospitals_temp
WHERE ST_Intersects(location, ST_WKTToSQL(
'POLYGON ((-74.0 42.0, -73.0 42.0, -73.0 43.0, -74.0 43.0, -74.0 42.0))'))
""").show(3)
Another common spatial query is to find things within a specified distance of a particular location. You have probably used web-mapping applications to get this kind of information. You can issue SQL queries from your application for questions like:
The spatial function used for these queries is ST_Distance
, which computes the distance between the spatial values and returns a result in meters.
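To get a feel for the meters-scale results that distance functions on geographic coordinates return, here is a plain-Python haversine sketch. It uses a spherical approximation; the library's ST_Distance may use a more precise ellipsoidal model, so values can differ slightly:

```python
import math

def haversine_m(lon1, lat1, lon2, lat2):
    """Great-circle distance in meters between two lon/lat points.
    Spherical approximation, for illustration only."""
    r = 6371008.8  # mean Earth radius in meters
    p1, p2 = math.radians(lat1), math.radians(lat2)
    dlat = p2 - p1
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dlon / 2) ** 2
    return 2 * r * math.asin(math.sqrt(a))

# One degree of latitude is roughly 111 km:
print(haversine_m(-74.237, 42.037, -74.237, 43.037))  # ~111195 meters
```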
The following query generates eight results:
spark.sql("""
SELECT name
FROM hospitals_temp
WHERE ST_Distance(location, ST_Point(-74.237, 42.037)) < 46800.0
""").show()
A different way of querying the same location is to use the ST_Buffer function, which creates a circular buffer around a given geometry so that the geometries falling within that buffer can be determined. The ST_Buffer function takes as parameters a spatial geometry and a distance in meters for the buffer around that spatial value. The results are the same as when you use ST_Distance.
spark.sql("""
SELECT name
FROM hospitals_temp
WHERE
ST_Intersects(location,
ST_Buffer(ST_Point(-74.237, 42.037), 46800.0))
ORDER BY name
""").show(3)
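Conceptually, a circular buffer around a point can be approximated by a ring of vertices on a circle. The following plain-Python sketch builds such a polygon in WKT form; the equirectangular meters-to-degrees conversion and the 32-vertex ring are crude simplifications, whereas the library's ST_Buffer computes the buffer geodetically:

```python
import math

def buffer_point_wkt(lon, lat, radius_m, n=32):
    """Approximate a circular buffer around (lon, lat) as an n-vertex
    WKT POLYGON. Uses a rough equirectangular meters-to-degrees
    conversion -- illustration only, not the library's geodetic buffer."""
    dlat = radius_m / 111320.0                                 # meters per degree of latitude
    dlon = radius_m / (111320.0 * math.cos(math.radians(lat)))  # shrink with latitude
    pts = []
    for i in range(n + 1):  # repeat the first point to close the ring
        t = 2 * math.pi * i / n
        pts.append(f"{lon + dlon * math.cos(t):.6f} {lat + dlat * math.sin(t):.6f}")
    return "POLYGON ((" + ", ".join(pts) + "))"

wkt = buffer_point_wkt(-74.237, 42.037, 46800.0)
print(wkt[:60], "...")
```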
The following query returns the distance from a specified point to each object within a roughly 30-mile (46,800 m) radius:
spark.sql("""
SELECT name, ST_Distance(location, ST_Point(-74.237, 42.037)) AS distance
FROM hospitals_temp
WHERE ST_Distance(location, ST_Point(-74.237, 42.037)) < 46800.0
ORDER BY distance
""").show(3)
You could also use ST_Buffer to compute the spatial relation and then determine the distance, as shown in the following query:
spark.sql("""
SELECT name, ST_Distance(location, ST_Point(-74.237, 42.037)) AS distance
FROM hospitals_temp
WHERE
ST_Intersects(location,
ST_Buffer(ST_Point(-74.237, 42.037), 46800.0))
ORDER BY distance
""").show(3)
A key difference to note here is that the ST_Buffer function in this package supports buffering of arbitrary geometries, not just points. Note that an ST_Buffer query on large geometries can be expensive.
spark.sql("""
SELECT name, ST_Distance(location, ST_WKTToSQL(
'LINESTRING (-74.0 42.0, -73.0 42.0)'))
FROM hospitals_temp
WHERE ST_Intersects(location, ST_Buffer(ST_WKTToSQL(
'LINESTRING (-74.0 42.0, -73.0 42.0)'), 46800.0))
""").show(3)
In this notebook, you learned how to query spatial data you downloaded from the IBM Watson Studio Gallery. You registered each data frame (one with data on hospitals and another with county information) as a table to run your queries on. The sample queries showed you how to determine the hospitals within a certain distance or in a polygon, to find the name of the county in which a hospital is located, or to identify the counties where the population is underserved by hospitals. The sample queries showed you how to use and combine the most common Spark SQL spatial functions in queries.
Linsong Chu, Research Engineer at IBM Research
Copyright © 2019 IBM. This notebook and its source code are released under the terms of the MIT License.