ibm-watson-machine-learning
This notebook contains steps and code to develop a predictive model and start scoring new data. It introduces commands for getting data, basic data cleaning and exploration, pipeline creation, model training, model persistence to the Watson Machine Learning repository, model deployment, and scoring.
Some familiarity with Python is helpful. This notebook runs on Python and Spark.
You will use the Telco Customer Churn data set, which details anonymized customer data from a telecommunications company. Use this data set to predict customer churn, which is critical to the business because it is easier to retain existing customers than to acquire new ones.
The learning goals of this notebook are:
This notebook contains the following parts:
Before you use the sample code in this notebook, you must perform the following setup tasks:
Authenticate the Watson Machine Learning service on IBM Cloud. You need to provide the platform api_key and instance location.
You can use the IBM Cloud CLI to retrieve the platform API key and instance location.
The API key can be generated in the following way:
ibmcloud login
ibmcloud iam api-key-create API_KEY_NAME
Get the value of api_key from the output.
The location of your WML instance can be retrieved in the following way:
ibmcloud login --apikey API_KEY -a https://cloud.ibm.com
ibmcloud resource service-instance WML_INSTANCE_NAME
Get the value of location from the output.
Tip: Your Cloud API key can be generated by going to the Users section of the Cloud console. From that page, click your name, scroll down to the API Keys section, and click Create an IBM Cloud API key. Give your key a name and click Create, then copy the created key and paste it below. You can also get a service-specific URL by going to the Endpoint URLs section of the Watson Machine Learning docs. You can check your instance location in your Watson Machine Learning (WML) Service instance details.
You can also get a service-specific API key by going to the Service IDs section of the Cloud Console. From that page, click Create, then copy the created key and paste it below.
Action: Enter your api_key and location in the following cell.
api_key = 'PASTE YOUR PLATFORM API KEY HERE'
location = 'PASTE YOUR INSTANCE LOCATION HERE'
wml_credentials = {
"apikey": api_key,
"url": 'https://' + location + '.ml.cloud.ibm.com'
}
!rm -rf /home/spark/shared/user-libs/python3.10*
!pip install -U --user ibm-watson-machine-learning
from ibm_watson_machine_learning import APIClient
client = APIClient(wml_credentials)
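As an optional check (a minimal sketch), you can print the version of the client that the notebook is using, to confirm that the freshly installed ibm-watson-machine-learning package has been picked up.
# Optional check: print the version of the ibm-watson-machine-learning client in use.
print(client.version)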
First, create a space that will be used for your work. If you do not have a space already created, you can use the Deployment Spaces Dashboard to create one. Copy the space_id of that space and paste it below.
Tip: You can also use the SDK to prepare the space for your work. More information can be found here.
Action: Assign space ID below
space_id = 'PASTE YOUR SPACE ID HERE'
You can use the list method to print all existing spaces.
client.spaces.list(limit=10)
To be able to interact with all resources available in Watson Machine Learning, you need to set the space which you will be using.
client.set.default_space(space_id)
try:
from pyspark.sql import SparkSession
except:
print('Error: Spark runtime is missing. If you are using Watson Studio change the notebook runtime to Spark.')
raise
In this section you will load the data as an Apache® Spark DataFrame and perform a basic exploration.
!pip install wget
import os
from wget import download
sample_dir = 'spark_sample_model'
if not os.path.isdir(sample_dir):
os.mkdir(sample_dir)
filename = os.path.join(sample_dir, 'WA_FnUseC_TelcoCustomerChurn.csv')
if not os.path.isfile(filename):
filename = download("https://github.com/IBM/watson-machine-learning-samples/raw/master/cloud/data/customer_churn/WA_FnUseC_TelcoCustomerChurn.csv", out=sample_dir)
spark = SparkSession.builder.getOrCreate()
df_data = spark.read\
.format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat')\
.option('header', 'true')\
.option('inferSchema', 'true')\
.option('nanValue', ' ')\
.option('nullValue', ' ')\
.load(filename)
Explore the loaded data by using the following Apache® Spark DataFrame methods:
df_data.printSchema()
print("Number of fields: %3g" % len(df_data.schema))
As you can see, the data contains 21 fields. The "Churn" field is the one we would like to predict (the label).
print("Total number of records: " + str(df_data.count()))
The data set contains 7043 records.
Now you will check if all records have complete data.
df_complete = df_data.dropna()
print("Number of records with complete data: %3g" % df_complete.count())
You can see that there are some missing values. If you investigate, you will find that all missing values occur in the TotalCharges feature (see the optional check below). We will use the dataset with missing values removed for model training and evaluation.
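The following optional check is a minimal sketch (using standard pyspark.sql functions) that counts missing values per column, so you can confirm that all of them occur in TotalCharges.
# Optional check: count null values in each column of the raw data.
from pyspark.sql.functions import col, count, when

df_data.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in df_data.columns]
).show(vertical=True)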
Now you will inspect the distribution of classes in the label column.
df_complete.groupBy('Churn').count().show()
In this section you will learn how to prepare data, create an Apache® Spark machine learning pipeline, and train a model.
In this subsection you will split your data into train, test, and predict datasets.
(train_data, test_data, predict_data) = df_complete.randomSplit([0.8, 0.18, 0.02], 24)
print("Number of records for training: " + str(train_data.count()))
print("Number of records for evaluation: " + str(test_data.count()))
print("Number of records for prediction: " + str(predict_data.count()))
As you can see, your data has been successfully split into three datasets: the train data set (80%), the test data set (18%), and the predict data set (2%).
In this section you will create an Apache® Spark machine learning pipeline and then train the model.
In the first step you need to import the Apache® Spark machine learning packages that will be needed in the subsequent steps.
from pyspark.ml.feature import StringIndexer, IndexToString, RFormula
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, Model
In the following step, convert all of the predictors to a features vector and convert the label feature to a numeric value.
lab = StringIndexer(inputCol = 'Churn', outputCol = 'label')
features = RFormula(formula = "~ gender + SeniorCitizen + Partner + Dependents + tenure + PhoneService + MultipleLines + InternetService + OnlineSecurity + OnlineBackup + DeviceProtection + TechSupport + StreamingTV + StreamingMovies + Contract + PaperlessBilling + PaymentMethod + MonthlyCharges + TotalCharges - 1")
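The following optional sketch fits and applies the two transformers directly on train_data, so you can preview the assembled features vector and the numeric label before wiring them into a pipeline.
# Optional preview: apply RFormula and StringIndexer outside the pipeline.
preview = lab.fit(train_data).transform(
    features.fit(train_data).transform(train_data)
)
preview.select('features', 'label').show(2, truncate=False)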
Next, define the estimator you want to use for classification. Logistic Regression is used in the following example.
lr = LogisticRegression(maxIter = 10)
Let's build the pipeline now. A pipeline consists of transformers and an estimator.
pipeline_lr = Pipeline(stages=[features, lab, lr])
Now, you can train your Logistic Regression model using the previously defined pipeline and train data.
model_lr = pipeline_lr.fit(train_data)
You can check your model accuracy now. To evaluate the model, use test data.
predictions = model_lr.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test dataset:")
print("Accuracy = %3.2f" % accuracy)
You can now tune your model to achieve better accuracy. For simplicity, this example omits the tuning step; a minimal sketch is shown below.
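As a minimal tuning sketch (not part of the original flow), you could grid-search the regularization parameter of the Logistic Regression estimator with cross-validation, reusing the pipeline and evaluator defined above; the parameter values below are illustrative only.
# Optional tuning sketch: cross-validated grid search over the regularization parameter.
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .build()

cv = CrossValidator(estimator=pipeline_lr,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=3)
# cv_model = cv.fit(train_data)  # uncomment to run; this takes longer than a single fit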
In this section you will learn how to store your pipeline and model in Watson Machine Learning repository using Python client libraries.
The ibm-cos-sdk library allows Python developers to manage Cloud Object Storage (COS).
import ibm_boto3
from ibm_botocore.client import Config
Action: Put credentials from your Object Storage service on IBM Cloud here.
cos_credentials = {
"apikey": "***",
"cos_hmac_keys": {
"access_key_id": "***",
"secret_access_key": "***"
},
"endpoints": "***",
"iam_apikey_description": "***",
"iam_apikey_name": "***",
"iam_role_crn": "***",
"iam_serviceid_crn": "***",
"resource_instance_id": "***"
}
connection_apikey = cos_credentials['apikey']
connection_resource_instance_id = cos_credentials["resource_instance_id"]
connection_access_key_id = cos_credentials['cos_hmac_keys']['access_key_id']
connection_secret_access_key = cos_credentials['cos_hmac_keys']['secret_access_key']
Action: Define the service endpoint we will use.
Tip: You can find this information in the Endpoints section of your Cloud Object Storage instance dashboard.
service_endpoint = 'https://s3.us.cloud-object-storage.appdomain.cloud'
You also need the IBM Cloud authorization endpoint to be able to create the COS resource object.
auth_endpoint = 'https://iam.cloud.ibm.com/identity/token'
Create a COS resource object to be able to write data to Cloud Object Storage.
cos = ibm_boto3.resource('s3',
ibm_api_key_id=cos_credentials['apikey'],
ibm_service_instance_id=cos_credentials['resource_instance_id'],
ibm_auth_endpoint=auth_endpoint,
config=Config(signature_version='oauth'),
endpoint_url=service_endpoint)
Now you will create a bucket in COS and upload the training dataset WA_FnUseC_TelcoCustomerChurn.csv to it.
from uuid import uuid4
bucket_uid = str(uuid4())
score_filename = "WA_FnUseC_TelcoCustomerChurn.csv"
buckets = ["churn-" + bucket_uid]
for bucket in buckets:
if not cos.Bucket(bucket) in cos.buckets.all():
print('Creating bucket "{}"...'.format(bucket))
try:
cos.create_bucket(Bucket=bucket)
except ibm_boto3.exceptions.ibm_botocore.client.ClientError as e:
print('Error: {}.'.format(e.response['Error']['Message']))
bucket_obj = cos.Bucket(buckets[0])
print('Uploading data {}...'.format(score_filename))
with open(filename, 'rb') as f:
bucket_obj.upload_fileobj(f, score_filename)
print('{} is uploaded.'.format(score_filename))
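As an optional check (a minimal sketch using the ibm_boto3 resource API), you can list the objects in the new bucket to confirm that the file was uploaded.
# Optional check: list the objects stored in the newly created bucket.
for obj in bucket_obj.objects.all():
    print(obj.key, obj.size)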
datasource_type = client.connections.get_datasource_type_uid_by_name('bluemixcloudobjectstorage')
conn_meta_props= {
client.connections.ConfigurationMetaNames.NAME: "COS connection - spark",
client.connections.ConfigurationMetaNames.DATASOURCE_TYPE: datasource_type,
client.connections.ConfigurationMetaNames.PROPERTIES: {
'bucket': buckets[0],
'access_key': connection_access_key_id,
'secret_key': connection_secret_access_key,
'iam_url': auth_endpoint,
'url': service_endpoint
}
}
conn_details = client.connections.create(meta_props=conn_meta_props)
Note: The above connection can alternatively be initialized with api_key and resource_instance_id instead of HMAC keys. The above cell can be replaced with:
conn_meta_props = {
    client.connections.ConfigurationMetaNames.NAME: "COS connection - spark",
    client.connections.ConfigurationMetaNames.DATASOURCE_TYPE: client.connections.get_datasource_type_uid_by_name('bluemixcloudobjectstorage'),
    client.connections.ConfigurationMetaNames.DESCRIPTION: "Connection to Cloud Object Storage",
    client.connections.ConfigurationMetaNames.PROPERTIES: {
        'bucket': buckets[0],
        'api_key': cos_credentials['apikey'],
        'resource_instance_id': cos_credentials['resource_instance_id'],
        'iam_url': 'https://iam.cloud.ibm.com/identity/token',
        'url': 'https://s3.us.cloud-object-storage.appdomain.cloud'
    }
}
conn_details = client.connections.create(meta_props=conn_meta_props)
connection_id = client.connections.get_uid(conn_details)
In this subsection you will learn how to save pipeline and model artifacts to your Watson Machine Learning instance.
training_data_references = [
{
"id": "customer churn",
"type": "connection_asset",
"connection": {
"id": connection_id
},
"location": {
"bucket": buckets[0],
"file_name": score_filename,
}
}
]
saved_model = client.repository.store_model(
model=model_lr,
meta_props={
client.repository.ModelMetaNames.NAME:'Customer Churn model',
client.repository.ModelMetaNames.TYPE: "mllib_3.3",
client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: client.software_specifications.get_id_by_name('spark-mllib_3.3'),
client.repository.ModelMetaNames.TRAINING_DATA_REFERENCES: training_data_references,
client.repository.ModelMetaNames.LABEL_FIELD: "Churn",
},
training_data=train_data,
pipeline=pipeline_lr)
Get saved model metadata from Watson Machine Learning.
published_model_ID = client.repository.get_model_id(saved_model)
print("Model Id: " + str(published_model_ID))
The model ID can be used to retrieve the latest model version from the Watson Machine Learning instance.
In this subsection you will learn how to load back a saved model from a specified instance of Watson Machine Learning.
loaded_model = client.repository.load(published_model_ID)
print(type(loaded_model))
As you can see, the model type is correct. You have now learned how to save and load the model from the Watson Machine Learning repository.
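As an optional check (a minimal sketch), you can also inspect the stages of the loaded PipelineModel to confirm that they match the pipeline that was stored.
# Optional check: print the types of the stages in the loaded pipeline model.
for stage in loaded_model.stages:
    print(type(stage).__name__)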
In this section you will learn how to score data locally with the loaded model and visualize the prediction results with the plotly package.
In this subsection you will score the predict_data data set.
predictions = loaded_model.transform(predict_data)
Preview the results by calling the show() method on the predictions DataFrame.
predictions.show(5, truncate=False, vertical=True)
By tabulating a count, you can see the split between labels.
predictions.select("prediction").groupBy("prediction").count().show(truncate=False)
In this section you will learn how to create batch deployment and to score a new data record by using the Watson Machine Learning REST API. For more information about REST APIs, see the Swagger Documentation.
First, download the scoring data into the notebook's filesystem.
import os
from wget import download
sample_dir = 'spark_sample_model'
if not os.path.isdir(sample_dir):
os.mkdir(sample_dir)
filename = os.path.join(sample_dir, 'scoreInput.csv')
if not os.path.isfile(filename):
filename = download("https://github.com/IBM/watson-machine-learning-samples/raw/master/cloud/data/customer_churn/scoreInput.csv", out=sample_dir)
Now you can create a batch scoring endpoint. Execute the following sample code that uses the published_model_ID value to create the scoring endpoint for predictions.
meta_data = {
client.deployments.ConfigurationMetaNames.NAME: "Customer Churn batch deployment",
client.deployments.ConfigurationMetaNames.BATCH: {},
client.deployments.ConfigurationMetaNames.HARDWARE_SPEC: {
"name": "S",
"num_nodes": 1
}
}
deployment_details = client.deployments.create(published_model_ID, meta_props=meta_data)
The batch deployment has been created.
You can now retrieve your deployment ID.
deployment_uid = client.deployments.get_uid(deployment_details)
You can also list all deployments in your space.
client.deployments.list()
If you want to get additional information about your deployment, you can retrieve the deployment details as shown below.
client.deployments.get_details(deployment_uid)
Tip: To install pandas, execute !pip install pandas.
import pandas as pd
score_input = pd.read_csv(filename).astype('object')
job_payload_ref = {
client.deployments.ScoringMetaNames.INPUT_DATA: [
{
"fields": score_input.columns.tolist(),
"values": [score_input.loc[0].tolist()]
}
]
}
job = client.deployments.create_job(deployment_uid, meta_props=job_payload_ref)
Your job has now been submitted to the Spark runtime.
You can now retrieve your job ID.
job_id = client.deployments.get_job_uid(job)
You can also list all jobs in your space.
client.deployments.list_jobs()
If you want to get additional information about your job, you can retrieve the job details as shown below.
client.deployments.get_job_details(job_id)
Here you can check the status of your batch scoring job. When the status of the Spark job is completed, the scoring results are available from the job details.
import time
elapsed_time = 0
while client.deployments.get_job_status(job_id).get('state') != 'completed' and elapsed_time < 300:
print(f" Current state: {client.deployments.get_job_status(job_id).get('state')}")
elapsed_time += 10
time.sleep(10)
if client.deployments.get_job_status(job_id).get('state') == 'completed':
print(f" Current state: {client.deployments.get_job_status(job_id).get('state')}")
job_details_do = client.deployments.get_job_details(job_id)
print(job_details_do)
else:
print("Job hasn't completed successfully in 5 minutes.")
Get scored data
import json
print(json.dumps(client.deployments.get_job_details(job_id), indent=1))
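If you prefer to work with the scored values directly, the following minimal sketch pulls the predictions out of the job details; it assumes the standard inline-scoring response structure (entity -> scoring -> predictions), which you can verify from the printed job details above.
# Optional: extract the fields and values of the predictions from the job details.
job_details = client.deployments.get_job_details(job_id)
for result in job_details.get('entity', {}).get('scoring', {}).get('predictions', []):
    print(result.get('fields'))
    print(result.get('values'))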
If you want to clean up all created assets, please follow this sample notebook.
You successfully completed this notebook! You learned how to use Apache Spark machine learning as well as Watson Machine Learning for model creation and deployment. Check out our Online Documentation for more samples, tutorials, documentation, how-tos, and blog posts.
Amadeusz Masny, Python Software Developer in Watson Machine Learning at IBM
Copyright © 2020-2024 IBM. This notebook and its source code are released under the terms of the MIT License.