# Watson Machine Learning credentials. Both values are placeholders and must
# be replaced before the client below can authenticate.
api_key = 'PASTE YOUR PLATFORM API KEY HERE'
location = 'PASTE YOUR INSTANCE LOCATION HERE'

# The service URL is built from the instance location.
wml_credentials = dict(
    apikey=api_key,
    url=f'https://{location}.ml.cloud.ibm.com',
)


!pip install ibm-watson-machine-learning


from ibm_watson_machine_learning import APIClient

# Build the API client from the `wml_credentials` dictionary (apikey + url).
client = APIClient(wml_credentials)


# ID of the space to work in. Placeholder: replace with a real space ID.
space_id = 'PASTE YOUR SPACE ID HERE'


# List spaces, passing limit=10 (presumably to help look up the ID above).
client.spaces.list(limit=10)


# Select `space_id` as the client's default space for the calls that follow.
client.set.default_space(space_id)


# Check that PySpark can be imported before any Spark code runs.
try:
    from pyspark.sql import SparkSession
except ImportError:
    # Only a failed import means the runtime is missing. Any other exception
    # (for example KeyboardInterrupt) propagates without this hint.
    print('Error: Spark runtime is missing. If you are using Watson Studio change the notebook runtime to Spark.')
    raise


! pip install wget


import os
from wget import download

# Local working directory for the sample data set; created if absent.
sample_dir = 'spark_sample_model'
os.makedirs(sample_dir, exist_ok=True)

# Fetch the training CSV only when it is not already on disk. The value
# returned by download() is used as the file path from here on.
filename = os.path.join(sample_dir, 'car_rental_training_data.csv')
if not os.path.isfile(filename):
    filename = download(
        'https://github.com/IBM/watson-machine-learning-samples/raw/master/cloud/data/cars-4-you/car_rental_training_data.csv',
        out=sample_dir,
    )


# Get the active Spark session, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Load the CSV: first row is the header, fields are separated by ';',
# and column types are inferred.
df_data = (
    spark.read
    .format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .option("delimiter", ";")
    .load(filename)
)
# Notebook-style peek at the first three rows.
df_data.take(3)


# Inspect the column layout.
df_data.printSchema()

print(f"Number of records: {df_data.count()}")

# Row count per business area (the column used as the label later on).
df_data.select('Business_Area').groupBy('Business_Area').count().show(truncate=False)

# Keep only the columns the model needs and split 80/20 with a fixed seed
# so the split is reproducible.
train_data, test_data = (
    df_data
    .select("ID", "Customer_Service", "Business_Area")
    .randomSplit([0.8, 0.2], seed=24)
)

print(f"Number of training records: {train_data.count()}")
print(f"Number of testing records : {test_data.count()}")


from pyspark.ml.feature import StringIndexer, IndexToString, HashingTF, IDF, Tokenizer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, Model
from pyspark.sql.types import *


# Text features: Customer_Service -> words -> hash -> features.
tokenizer = Tokenizer(
    inputCol="Customer_Service",
    outputCol="words",
)
hashing_tf = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol='hash',
)
idf = IDF(
    inputCol=hashing_tf.getOutputCol(),
    outputCol="features",
    minDocFreq=5,
)

# Label indexer, fitted on the training split up front so that its `labels`
# are available to the IndexToString stage below.
string_indexer_label = StringIndexer(
    inputCol="Business_Area",
    outputCol="label",
).fit(train_data)

# Classifier that reads the IDF output column as its features.
dt_area = DecisionTreeClassifier(
    labelCol="label",
    featuresCol=idf.getOutputCol(),
)

# Map the numeric prediction back to the original label strings.
label_converter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=string_indexer_label.labels,
)

# Assemble all stages in execution order.
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashing_tf,
        idf,
        string_indexer_label,
        dt_area,
        label_converter,
    ]
)


# Fit the pipeline on the training split.
model = pipeline.fit(train_data)

# Apply the fitted pipeline to the test split and show three sample rows.
predictions = model.transform(test_data)
predictions.select('Customer_Service', 'Business_Area', 'predictedLabel').show(3)

predictions.printSchema()

# Accuracy of the `prediction` column against the indexed `label` column.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

print(f"Accuracy = {accuracy:3.2f}")


import ibm_boto3
from ibm_botocore.client import Config


# Cloud Object Storage service credentials. Every "***" is a placeholder
# that must be replaced with the value from your own COS instance.
cos_credentials = {
    "apikey": "***",
    "cos_hmac_keys": {
        "access_key_id": "***",
        "secret_access_key": "***",
    },
    "endpoints": "***",
    "iam_apikey_description": "***",
    "iam_apikey_name": "***",
    "iam_role_crn": "***",
    "iam_serviceid_crn": "***",
    "resource_instance_id": "***",
}

# Individual credential fields pulled out for use further down.
connection_apikey = cos_credentials['apikey']
connection_resource_instance_id = cos_credentials["resource_instance_id"]
connection_access_key_id = cos_credentials['cos_hmac_keys']['access_key_id']
connection_secret_access_key = cos_credentials['cos_hmac_keys']['secret_access_key']

# COS service endpoint and the IAM token endpoint used for authentication.
service_endpoint = 'https://s3.us.cloud-object-storage.appdomain.cloud'
auth_endpoint = 'https://iam.cloud.ibm.com/identity/token'


# S3-compatible resource handle for Cloud Object Storage, authenticated with
# the API key and service instance ID taken from `cos_credentials`.
cos = ibm_boto3.resource(
    's3',
    ibm_api_key_id=connection_apikey,
    ibm_service_instance_id=connection_resource_instance_id,
    ibm_auth_endpoint=auth_endpoint,
    config=Config(signature_version='oauth'),
    endpoint_url=service_endpoint,
)


from uuid import uuid4

# Random UUID suffix so the bucket name differs on every run.
bucket_uid = str(uuid4())

# Object name used for the uploaded CSV, and the bucket(s) to create.
score_filename = "car_rental_training_data.csv"
buckets = [f"car-rental-{bucket_uid}"]


# Create every bucket that does not exist yet. A ClientError from the
# service is reported and the loop moves on (best effort).
for bucket in buckets:
    if cos.Bucket(bucket) in cos.buckets.all():
        continue
    print(f'Creating bucket "{bucket}"...')
    try:
        cos.create_bucket(Bucket=bucket)
    except ibm_boto3.exceptions.ibm_botocore.client.ClientError as e:
        print(f"Error: {e.response['Error']['Message']}.")


# Handle to the first (and only) bucket in the list.
bucket_obj = cos.Bucket(buckets[0])

# Upload the local CSV under the object name `score_filename`.
print(f'Uploading data {score_filename}...')
with open(filename, 'rb') as f:
    bucket_obj.upload_fileobj(f, score_filename)
print(f'{score_filename} is uploaded.')


# Look up the datasource type UID registered under the name
# 'bluemixcloudobjectstorage'.
datasource_type = client.connections.get_datasource_type_uid_by_name('bluemixcloudobjectstorage')

# Metadata for the new connection: display name, datasource type, and the
# COS properties (bucket, HMAC key pair, IAM and service endpoints).
conn_meta_props= {
    client.connections.ConfigurationMetaNames.NAME: "COS connection - spark",
    client.connections.ConfigurationMetaNames.DATASOURCE_TYPE: datasource_type,
    client.connections.ConfigurationMetaNames.PROPERTIES: {
        'bucket': buckets[0],
        'access_key': connection_access_key_id,
        'secret_key': connection_secret_access_key,
        'iam_url': auth_endpoint,
        'url': service_endpoint
    }
}

# Create the connection from the metadata above.
conn_details = client.connections.create(meta_props=conn_meta_props)


# Extract the connection's ID from the creation details; it is referenced by
# the training data reference later on.
connection_id = client.connections.get_uid(conn_details)


# Single training-data reference: the uploaded CSV, addressed through the
# connection asset (bucket + object name).
training_data_references = [
    {
        "id": "car-rental-training",
        "type": "connection_asset",
        "connection": {"id": connection_id},
        "location": {
            "bucket": buckets[0],
            "file_name": score_filename,
        },
    },
]


# Store the fitted model in the repository, together with the pipeline it
# was produced from and the training data it was fitted on.
# The model name previously ended in a stray "jj"; it now matches the
# wording of the deployment name used further down.
saved_model = client.repository.store_model(
    model=model,
    meta_props={
        client.repository.ModelMetaNames.NAME: "CARS4U - Business Area Prediction Model",
        client.repository.ModelMetaNames.TYPE: "mllib_3.3",
        # Software specification ID is looked up by name.
        client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: client.software_specifications.get_id_by_name('spark-mllib_3.3'),
        client.repository.ModelMetaNames.TRAINING_DATA_REFERENCES: training_data_references,
        # Column in the training data that holds the target value.
        client.repository.ModelMetaNames.LABEL_FIELD: "Business_Area",
    },
    training_data=train_data,
    pipeline=pipeline)


# ID of the stored model, taken from the details returned by store_model.
published_model_id = client.repository.get_model_id(saved_model)

print(f"Model Id: {published_model_id}")

# Notebook-style display of the stored model's details.
client.repository.get_model_details(published_model_id)


# Create a deployment for the stored model. The ONLINE key is passed with an
# empty configuration dict.
deployment_details = client.deployments.create(
    published_model_id, 
    meta_props={
        client.deployments.ConfigurationMetaNames.NAME: "CARS4U - Business Area Prediction Model deployment",
        client.deployments.ConfigurationMetaNames.ONLINE: {}
    }
)


# Notebook-style display of the returned deployment details.
deployment_details


# One sample record for scoring: column names and the matching values,
# position by position.
fields = [
    'ID',
    'Gender',
    'Status',
    'Children',
    'Age',
    'Customer_Status',
    'Car_Owner',
    'Customer_Service',
    'Business_Area',
    'Satisfaction',
]
values = [
    3785,
    'Male',
    'S',
    1,
    17,
    'Inactive',
    'Yes',
    'The car should have been brought to us instead of us trying to find it in the lot.',
    'Product: Information',
    0,
]


import json

# Scoring payload: one input set holding the column names and a single row.
payload_scoring = {
    "input_data": [
        {
            "fields": fields,
            "values": [values],
        },
    ],
}
# Score against the deployment whose ID is read from `deployment_details`.
scoring_response = client.deployments.score(
    client.deployments.get_id(deployment_details),
    payload_scoring,
)

print(json.dumps(scoring_response, indent=3))

Use Spark to predict business area for car rental company with `ibm-watson-machine-learning`

Learning goals

Contents

1. Set up the environment

Connection to WML

Install and import the `ibm-watson-machine-learning` package

Working with spaces

Test Spark

2. Load and explore data

Explore data

3. Create an Apache Spark machine learning model

3.1 Prepare data for model training and evaluation

3.2 Create the pipeline

3.3 Train the model

4. Persist model

Save training data in your Cloud Object Storage

Create connections to a COS bucket

4.2 Save the pipeline and model

5. Deploy model in IBM Cloud

6. Score

7. Clean up

8. Summary and next steps

Authors

Use Spark to predict business area for car rental company with ibm-watson-machine-learning

Learning goals

Contents

1. Set up the environment

Connection to WML

Install and import the ibm-watson-machine-learning package

Working with spaces

Test Spark

2. Load and explore data

Explore data

3. Create an Apache Spark machine learning model

3.1 Prepare data for model training and evaluation

3.2 Create the pipeline

3.3 Train the model

4. Persist model

Save training data in your Cloud Object Storage

Create connections to a COS bucket

4.2 Save the pipeline and model

5. Deploy model in IBM Cloud

6. Score

7. Clean up

8. Summary and next steps

Authors

Use Spark to predict business area for car rental company with `ibm-watson-machine-learning`

Install and import the `ibm-watson-machine-learning` package

3.2 Create the pipeline

3.3 Train the model

4.2 Save the pipeline and model