api_key = 'PASTE YOUR PLATFORM API KEY HERE'
location = 'PASTE YOUR INSTANCE LOCATION HERE'


wml_credentials = {
    "apikey": api_key,
    "url": 'https://' + location + '.ml.cloud.ibm.com'
}


!rm -rf /home/spark/shared/user-libs/python3.10*


!pip install -U --user ibm-watson-machine-learning


from ibm_watson_machine_learning import APIClient

client = APIClient(wml_credentials)


space_id = 'PASTE YOUR SPACE ID HERE'


client.spaces.list(limit=10)


client.set.default_space(space_id)


try:
    from pyspark.sql import SparkSession
except:
    print('Error: Spark runtime is missing. If you are using Watson Studio change the notebook runtime to Spark.')
    raise


!pip install wget


import os
from wget import download

sample_dir = 'spark_sample_model'
if not os.path.isdir(sample_dir):
    os.mkdir(sample_dir)
    
filename = os.path.join(sample_dir, 'GoSales_Tx.csv')
if not os.path.isfile(filename):
    filename = download('https://github.com/IBM/watson-machine-learning-samples/raw/master/cloud/data/product-line-prediction/GoSales_Tx.csv', out=sample_dir)


spark = SparkSession.builder.getOrCreate()

df_data = spark.read\
  .format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat')\
  .option('header', 'true')\
  .option('inferSchema', 'true')\
  .load(filename)
df_data.take(3)


df_data.printSchema()


df_data.show()


df_data.count()


splitted_data = df_data.randomSplit([0.8, 0.18, 0.02], 24)
train_data = splitted_data[0]
test_data = splitted_data[1]
predict_data = splitted_data[2]

print("Number of training records: " + str(train_data.count()))
print("Number of testing records : " + str(test_data.count()))
print("Number of prediction records : " + str(predict_data.count()))


from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, Model


stringIndexer_label = StringIndexer(inputCol="PRODUCT_LINE", outputCol="label").fit(df_data)
stringIndexer_prof = StringIndexer(inputCol="PROFESSION", outputCol="PROFESSION_IX")
stringIndexer_gend = StringIndexer(inputCol="GENDER", outputCol="GENDER_IX")
stringIndexer_mar = StringIndexer(inputCol="MARITAL_STATUS", outputCol="MARITAL_STATUS_IX")


vectorAssembler_features = VectorAssembler(inputCols=["GENDER_IX", "AGE", "MARITAL_STATUS_IX", "PROFESSION_IX"], outputCol="features")


rf = RandomForestClassifier(labelCol="label", featuresCol="features")


labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=stringIndexer_label.labels)


pipeline_rf = Pipeline(stages=[stringIndexer_label, stringIndexer_prof, stringIndexer_gend, stringIndexer_mar, vectorAssembler_features, rf, labelConverter])


train_data.printSchema()


model_rf = pipeline_rf.fit(train_data)


predictions = model_rf.transform(test_data)
evaluatorRF = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluatorRF.evaluate(predictions)
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))


import ibm_boto3
from ibm_botocore.client import Config


cos_credentials = {
PASTE YOUR COS CREDENTIALS HERE
}

# example:
# cos_credentials = {
#   "apikey": "***",
#   "cos_hmac_keys": {
#     "access_key_id": "***",
#     "secret_access_key": "***"
#   },
#   "endpoints": "https://cos-service.bluemix.net/endpoints",
#   "iam_apikey_description": "***",
#   "iam_apikey_name": "***",
#   "iam_role_crn": "crn:v1:bluemix:public:iam::::serviceRole:Writer",
#   "iam_serviceid_crn": "***",
#   "resource_instance_id": "***"
# }


connection_apikey = cos_credentials['apikey']
connection_resource_instance_id = cos_credentials["resource_instance_id"]
connection_access_key_id = cos_credentials['cos_hmac_keys']['access_key_id']
connection_secret_access_key = cos_credentials['cos_hmac_keys']['secret_access_key']


service_endpoint = 'https://s3.us.cloud-object-storage.appdomain.cloud'


auth_endpoint = 'https://iam.cloud.ibm.com/identity/token'


cos = ibm_boto3.resource('s3',
                         ibm_api_key_id=cos_credentials['apikey'],
                         ibm_service_instance_id=cos_credentials['resource_instance_id'],
                         ibm_auth_endpoint=auth_endpoint,
                         config=Config(signature_version='oauth'),
                         endpoint_url=service_endpoint)


from uuid import uuid4

bucket_uid = str(uuid4())

score_filename = "GoSales_Tx.csv.csv"
buckets = ["product-line-" + bucket_uid]


for bucket in buckets:
    if not cos.Bucket(bucket) in cos.buckets.all():
        print('Creating bucket "{}"...'.format(bucket))
        try:
            cos.create_bucket(Bucket=bucket)
        except ibm_boto3.exceptions.ibm_botocore.client.ClientError as e:
            print('Error: {}.'.format(e.response['Error']['Message']))


bucket_obj = cos.Bucket(buckets[0])

print('Uploading data {}...'.format(score_filename))
with open(filename, 'rb') as f:
    bucket_obj.upload_fileobj(f, score_filename)
print('{} is uploaded.'.format(score_filename))


datasource_type = client.connections.get_datasource_type_uid_by_name('bluemixcloudobjectstorage')

conn_meta_props= {
    client.connections.ConfigurationMetaNames.NAME: "COS connection - spark",
    client.connections.ConfigurationMetaNames.DATASOURCE_TYPE: datasource_type,
    client.connections.ConfigurationMetaNames.PROPERTIES: {
        'bucket': buckets[0],
        'access_key': connection_access_key_id,
        'secret_key': connection_secret_access_key,
        'iam_url': auth_endpoint,
        'url': service_endpoint
    }
}

conn_details = client.connections.create(meta_props=conn_meta_props)


connection_id = client.connections.get_uid(conn_details)


training_data_references = [
                {
                    "id": "product line",
                    "type": "connection_asset",
                    "connection": {
                        "id": connection_id
                    },
                    "location": {
                        "bucket": buckets[0],
                        "file_name": score_filename,
                    }
                }
            ]


saved_model = client.repository.store_model(
    model=model_rf, 
    meta_props={
        client.repository.ModelMetaNames.NAME:'Product Line model',
        client.repository.ModelMetaNames.TYPE: "mllib_3.3",
        client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: client.software_specifications.get_id_by_name('spark-mllib_3.3'),
        client.repository.ModelMetaNames.TRAINING_DATA_REFERENCES: training_data_references,
        client.repository.ModelMetaNames.LABEL_FIELD: "PRODUCT_LINE",
    },  
    training_data=train_data, 
    pipeline=pipeline_rf)


published_model_id = client.repository.get_model_id(saved_model)

print("Model Id: " + str(published_model_id))


client.repository.get_model_details(published_model_id)


loaded_model = client.repository.load(published_model_id)


print(type(loaded_model))


predictions = loaded_model.transform(predict_data)


predictions.show(5, vertical=True)


predictions.select("predictedLabel").groupBy("predictedLabel").count().show()


deployment_details = client.deployments.create(
    published_model_id, 
    meta_props={
        client.deployments.ConfigurationMetaNames.NAME: "Product Line model deployment",
        client.deployments.ConfigurationMetaNames.ONLINE: {}
    }
)


deployment_details

Use Spark to predict product line with `ibm-watson-machine-learning`¶

Learning goals¶

Contents¶

1. Set up the environment¶

Connection to WML¶

Install and import the `ibm-watson-machine-learning` package¶

Working with spaces¶

2. Load and explore data¶

Test Spark¶

3. Create an Apache® Spark machine learning model¶

3.1: Prepare data¶

3.2: Create pipeline and train a model¶

4. Persist model¶

Save training data in your Cloud Object Storage¶

Create connections to a COS bucket¶

4.1: Save pipeline and model¶

4.2: Load model¶

5. Predict locally¶

5.1: Make local prediction using previously loaded model and test data¶

6. Deploy and score in a Cloud¶

6.1: Create online scoring endpoint¶

Create online deployment for published model¶

7. Clean up¶

8. Summary and next steps¶

Authors¶

Use Spark to predict product line with ibm-watson-machine-learning¶

Learning goals¶

Contents¶

1. Set up the environment¶

Connection to WML¶

Install and import the ibm-watson-machine-learning package¶

Working with spaces¶

2. Load and explore data¶

Test Spark¶

3. Create an Apache® Spark machine learning model¶

3.1: Prepare data¶

3.2: Create pipeline and train a model¶

4. Persist model¶

Save training data in your Cloud Object Storage¶

Create connections to a COS bucket¶

4.1: Save pipeline and model¶

4.2: Load model¶

5. Predict locally¶

5.1: Make local prediction using previously loaded model and test data¶

6. Deploy and score in a Cloud¶

6.1: Create online scoring endpoint¶

Create online deployment for published model¶

7. Clean up¶

8. Summary and next steps¶

Authors¶

Use Spark to predict product line with `ibm-watson-machine-learning`¶

Install and import the `ibm-watson-machine-learning` package¶