# Note, on Watson Studio the pip magic command `%pip` is not supported from within the notebook.  Use !pip instead. 
!pip install --user conversation_analytics_toolkit


import nltk

# Download the NLTK data packages used by this notebook up front.
for nltk_resource in ('words', 'punkt', 'stopwords'):
    nltk.download(nltk_resource)


import conversation_analytics_toolkit
from conversation_analytics_toolkit import wa_assistant_skills
from conversation_analytics_toolkit import transformation
from conversation_analytics_toolkit import filtering2 as filtering
from conversation_analytics_toolkit import analysis 
from conversation_analytics_toolkit import visualization 
from conversation_analytics_toolkit import selection as vis_selection
from conversation_analytics_toolkit import wa_adaptor 
from conversation_analytics_toolkit import transcript 
from conversation_analytics_toolkit import flows 
from conversation_analytics_toolkit import keyword_analysis 
from conversation_analytics_toolkit import sentiment_analysis 

import json
import pandas as pd
from pandas.io.json import json_normalize
from IPython.core.display import display, HTML


# Configure pandas display: do not truncate the contents of wide columns.
import pandas as pd
# Uncomment to also raise the number of displayed rows:
# pd.set_option('display.max_rows', 200)
pd.options.display.max_colwidth = None


# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
# from project_lib import Project
# project = Project(project_id='', project_access_token='')


import ibm_watson
from ibm_watson import AssistantV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

# Replace the placeholder below with your own API key.
authenticator = IAMAuthenticator("YOUR API KEY")

# Client for the Watson Assistant V1 API, pinned to a specific API version.
# (An older version that can be used instead: '2019-02-28'.)
service = AssistantV1(
    version='2021-06-14',
    authenticator=authenticator,
)

# Replace the placeholder with your service URL,
# for example "https://api.us-south.assistant.watson.cloud.ibm.com"
service.set_service_url("SERVICE URL")


# Select a workspace by a specific id ...
# workspace_id = ''  # Add workspace ID
# ... or list the workspaces through the API and use the first one returned.
workspaces = service.list_workspaces().get_result()
if not workspaces['workspaces']:
    # Fail with an actionable message instead of a bare IndexError on [0].
    raise RuntimeError(
        "No workspaces were returned for this Watson Assistant instance; "
        "check the API key and service URL, or set workspace_id explicitly."
    )
workspace_id = workspaces['workspaces'][0]['workspace_id']

# Fetch the selected workspace (requested with export=True).
workspace = service.get_workspace(
    workspace_id=workspace_id,
    export=True,
).get_result()

# Query parameters for the log fetch.
limit_number_of_records = 5000
# Example of a time-range query; use `query_filter = None` for no filter.
query_filter = "response_timestamp>=2019-10-30,response_timestamp<2019-10-31"
# query_filter = None

# Fetch the logs for the workspace.
df_logs = wa_adaptor.read_logs(service, workspace_id, limit_number_of_records, query_filter)


import requests

# This example uses the Watson Assistant banking data sample hosted on GitHub.
SAMPLE_WORKSPACE_URL = "https://raw.githubusercontent.com/watson-developer-cloud/assistant-dialog-flow-analysis/master/data/banking-sample/wa-workspace.json"
SAMPLE_LOGS_URL = "https://raw.githubusercontent.com/watson-developer-cloud/assistant-dialog-flow-analysis/master/data/banking-sample/wa-logs.json"


def _fetch_sample_json(url, timeout=30):
    """Download `url` and return its body parsed as JSON.

    Raises requests.HTTPError for a non-success HTTP status, so an error
    page is never handed to the JSON parser, and gives up after `timeout`
    seconds instead of waiting indefinitely.
    """
    sample_response = requests.get(url, timeout=timeout)
    sample_response.raise_for_status()
    return json.loads(sample_response.text)


# Pull the sample workspace.
workspace = _fetch_sample_json(SAMPLE_WORKSPACE_URL)

# NOTE: the workspace_id is typically available inside the workspace object.
# If you've used the `export skill` feature in Watson Assistant UI, you can find the skill id
# by clicking the `skill`-->`View API details` and copying the value of skill_id
workspace_id = workspace["workspace_id"]
# workspace_id = ''

# Pull the sample logs and load them into a DataFrame.
df_logs = pd.DataFrame.from_records(_fetch_sample_json(SAMPLE_LOGS_URL))
print("loaded {} log records".format(len(df_logs)))


# @hidden_cell
# The project token is an authorization token that is used by Watson Studio to access project resources. 
# For more details on project tokens, refer to https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/token.html
# ======
# from project_lib import Project
# project = Project(project_id='<YOUR PROJECT ID>', project_access_token='<YOUR PROJECT ACCESS TOKEN>')

# workspace_file = "wa-workspace.json"
# log_files = "wa-logs.json"

# workspace = json.loads(project.get_file(workspace_file))
# df_logs = pd.DataFrame.from_records(json.loads(project.get_file(log_files)))


# Depending on your production environment, your logs and workspace files might be stored in different locations.
# such as NoSQL databases, Cloud Object Storage files, etc.

# use custom code here, and make sure you load the workspace as a python dictionary, and the df_logs as a pandas DataFrame.
#workspace = 
#df_logs =


# If you have more than one skill, you can add multiple skill definitions.
skill_id = workspace_id
assistant_skills = wa_assistant_skills.WA_Assistant_Skills()
assistant_skills.add_skill(skill_id, workspace)

# Validate the workspace ids: compare the ids registered as skills with the
# ids that appear in the logs.
skill_ids_in_skills = pd.DataFrame(assistant_skills.list_skills())["skill_id"].unique()
workspace_ids_in_logs = df_logs["workspace_id"].unique()

# Format the id lists rather than using `str + ndarray`, which repeats the
# label for every id and raises TypeError when an id is not a string
# (for example NaN for a log record without a workspace_id).
print("workspace_ids in skills: {}".format(list(skill_ids_in_skills)))
print("workspace_ids in logs: {}".format(list(workspace_ids_in_logs)))


# Transform the raw logs into the toolkit's canonical form.
df_logs_canonical = transformation.to_canonical_WA_v2(
    df_logs,
    assistant_skills,
    skill_id_field=None,
    include_nodes_visited_str_types=True,
)
# Alternative: take the skill id from the "workspace_id" field of each record.
# df_logs_canonical = transformation.to_canonical_WA_v2(df_logs, assistant_skills, skill_id_field="workspace_id", include_nodes_visited_str_types=True)

# The rest of the notebook works on df_logs_to_analyze (a shallow copy).
df_logs_to_analyze = df_logs_canonical.copy(deep=False)
df_logs_to_analyze.head(2)


title = "All Conversations"

# Aggregate the logs into turn-based path flows.
turn_based_path_flows = analysis.aggregate_flows(
    df_logs_to_analyze,
    mode="turn-based",
    on_column="turn_label",
    max_depth=400,
    trim_reroutes=False,
)

# Increase the width of the Jupyter output cell.
display(HTML("<style>.container { width:95% !important; }</style>"))

config = dict(
    commonRootPathName=title,      # label for the first root node
    height=800,                    # visualization height (default 600)
    nodeWidth=250,
    maxChildrenInNode=6,           # immediate children to show; the rest collapse into an *others* node (default 5)
    linkWidth=400,                 # width between pathflow layers (default 360)
    sortByAttribute='flowRatio',   # chart sorting; options: flowRatio, dropped_offRatio, flows, dropped_off, rerouted
    title=title,
    mode="turn-based",
)

flow_records_json = turn_based_path_flows.to_json(orient='records')
jsondata = json.loads(flow_records_json)
visualization.draw_flowchart(config, jsondata, python_selection_var="selection")


# Keep only the conversations that include an escalation.
title2 = "Banking Card Escalated"
filters = filtering.ChainFilter(df_logs_to_analyze).setDescription(title2)

# First the node with a condition on #Banking-Card_Selection (node_1_1510880732839),
# then a visit to the "Transfer To Live Agent" node (node_25_1516679473977).
(
    filters
    .by_dialog_node_id('node_1_1510880732839')
    .by_dialog_node_id('node_25_1516679473977')
)
filters.printConversationFilters()
# Take the filtered dataframe from the filter chain.
# Note: intermediate dataframes are available through getDataFrame(index).
filtered_df = filters.getDataFrame()

turn_based_path_flows = analysis.aggregate_flows(
    filtered_df,
    mode="turn-based",
    on_column="turn_label",
    max_depth=400,
    trim_reroutes=False,
)

config = dict(
    commonRootPathName=title2,
    title=title2,
    height=800,
    nodeWidth=250,
    maxChildrenInNode=6,
    linkWidth=400,
    sortByAttribute='flowRatio',
    mode="turn-based",
)

flow_records_json = turn_based_path_flows.to_json(orient='records')
jsondata = json.loads(flow_records_json)
visualization.draw_flowchart(config, jsondata, python_selection_var="selection")


# Milestones of the `Schedule Appointment` task, keyed by the dialog node id
# that is mapped to each milestone (kept in flow order).
milestone_by_node_id = {
    "node_21_1513047983871": "Appointment scheduling start",
    "handler_28_1513048122602": "Schedule time",
    "handler_31_1513048234102": "Enter zip code",
    "node_3_1517200453933": "Branch selection",
    "node_41_1513049128006": "Enter purpose of appointment",
    "node_43_1513049260736": "Scheduling completion",
}

milestone_analysis = analysis.MilestoneFlowGraph(assistant_skills.get_skill_by_id(skill_id))

# Register the milestone names first, then attach one node to each of them.
milestone_analysis.add_milestones(list(milestone_by_node_id.values()))
for dialog_node_id, milestone_name in milestone_by_node_id.items():
    milestone_analysis.add_node_to_milestone(dialog_node_id, milestone_name)


# Enrich the logs with milestone information (adds a 'milestone' column).
milestone_analysis.enrich_milestones(df_logs_to_analyze)

# Keep only the log records that carry a milestone.
has_milestone = df_logs_to_analyze["milestone"].notna()
df_milestones = df_logs_to_analyze[has_milestone]

# Optionally collapse consecutive milestones for a simpler flow visualization.
df_milestones = analysis.simplify_flow_consecutive_milestones(df_milestones)


# Compute the aggregate flows of milestones.
computed_flows = analysis.aggregate_flows(
    df_milestones,
    mode="milestone-based",
    on_column="milestone",
    max_depth=30,
    trim_reroutes=False,
)

config = dict(
    commonRootPathName='All Conversations',  # label for the first root node
    height=800,                              # visualization height (default 600)
    maxChildrenInNode=6,                     # immediate children to show; the rest collapse into an *other* node (default 5)
    # linkWidth=400,                         # width between pathflow layers (default 360)
    sortByAttribute='flowRatio',             # chart sorting; options: flowRatio, dropped_offRatio, flows, dropped_off, rerouted
    title="Abandoned Conversations in Appointment Schedule Flow",
    showVisitRatio='fromTotal',              # default 'fromTotal'; 'fromPrevious' computes percentages from the previous step
    mode='milestone-based',
)

milestone_flow_records_json = computed_flows.to_json(orient='records')
jsondata = json.loads(milestone_flow_records_json)
visualization.draw_flowchart(config, jsondata, python_selection_var="milestone_selection")


# `milestone_selection` holds details about the node selected in the
# flowchart, including the conversations that were abandoned at that point.
selected_path = milestone_selection["path"]
print("Selected Path: ", selected_path)

# Fetch the dropped-off conversations from the selection.
selection_frames = vis_selection.to_dataframe(milestone_selection)
dropped_off_conversations = selection_frames["dropped_off"]
dropped_off_count = len(dropped_off_conversations)
print(f"The selection contains {dropped_off_count} records, with a reference back to the converstion logs")
dropped_off_conversations.head()


# Add sentiment columns to the logs; the 'sentiment' column is read below.
df_logs_to_analyze = sentiment_analysis.add_sentiment_columns(df_logs_to_analyze)

# Create insights and highlight annotations for the transcript visualization.
NEGATIVE_SENTIMENT_THRESHOLD = -0.15

# One vectorized comparison replaces two row-wise DataFrame.apply passes; it is
# faster on large logs and also works when the dataframe has no rows.
is_negative_sentiment = df_logs_to_analyze["sentiment"] < NEGATIVE_SENTIMENT_THRESHOLD

# Each row gets its own list object: ["Negative Sentiment"] or an empty list.
df_logs_to_analyze["insights_tags"] = [
    ["Negative Sentiment"] if is_negative else []
    for is_negative in is_negative_sentiment
]
df_logs_to_analyze["highlight"] = is_negative_sentiment


# Replace the selection rows with the matching conversation log records.
dropped_off_conversations = vis_selection.fetch_logs_by_selection(
    df_logs_to_analyze,
    dropped_off_conversations,
)

# Render them with the transcript visualization.
dfc = transcript.to_transcript(dropped_off_conversations)
config = dict(debugger=True)
visualization.draw_transcript(config, dfc)


# Gather user utterances from the dropped-off conversations:
# the last utterance of each, and all utterances.
last_utterances_abandoned = vis_selection.get_last_utterances_from_selection(
    milestone_selection, df_logs_to_analyze)
all_utterances_abandoned = vis_selection.get_all_utterances_from_selection(
    milestone_selection, df_logs_to_analyze)

# Analyze the last user input before abandonment.
num_unigrams = 10
num_bigrams = 15
custom_stop_words = "would pm ok yes no thank thanks hi i you".split()
data = keyword_analysis.get_frequent_words_bigrams(
    last_utterances_abandoned,
    num_unigrams,
    num_bigrams,
    custom_stop_words,
)

# Draw the word-pack chart of the frequent words and bigrams.
config = dict(flattened=True, width=800, height=500)
visualization.draw_wordpackchart(config, data)


# A flow is defined by a name, one or more starting ("parent") nodes and one
# or more success ("completion") nodes. All descendants of the parent nodes
# are considered part of the flow, and a flow is considered successful if it
# reaches a completion node.
credit_card_payment_flow = {
    'name': 'Credit card payment',
    # condition on #Banking-Billing_Payment_Enquiry || #Banking-Billing_Making_Payments
    'parent_nodes': ['node_3_1511326054650'],
    # display of the confirmation "Thank you for your payment..."
    'completion_nodes': ['node_8_1512531332315'],
}
schedule_appointment_flow = {
    'name': 'Schedule appointment',
    # condition on '#Business_Information-Make_Appointment'
    'parent_nodes': ['node_21_1513047983871'],
    # display of the appointment confirmation
    'completion_nodes': ['node_43_1513049260736'],
}
flow_defs_initial = {
    'flows': [credit_card_payment_flow, schedule_appointment_flow],
}

# Create the list of all nodes that map to a flow, including descendant nodes.
flow_defs = flows.enrich_flows_by_workspace(flow_defs_initial, workspace)

# Enrich the logs with the columns ["flow", "flow_state"] that represent the
# state of the flow, then count and plot the outcomes.
df_logs_to_analyze = flows.enrich_canonical_by_flows(df_logs_to_analyze, flow_defs)
flow_outcome_summary = flows.count_flows(df_logs_to_analyze, flow_defs)
print(flow_outcome_summary)
flows.plot_flow_outcomes(flow_outcome_summary)


# Example: search the dialog nodes for all occurrences of the word 'Card'.
search_term = 'Card'
results = assistant_skills.re_search_in_dialog_nodes(search_term)
results.head(5)


# Draw the dialog chart of the skill for interactive exploration.
workspace = assistant_skills.get_skill_by_id(skill_id)
data = dict(workspace=workspace)
config = {}
visualization.draw_wa_dialog_chart(config, data)


import datetime
import pytz

filters = filtering.ChainFilter(df_logs_to_analyze).setDescription("Filter: collect Appointment Data during Jan 2020")
# 'node_22_1513048049461' corresponds to the 'Collect Appointment Data' node.
filters.by_dialog_node_id('node_22_1513048049461')

# You can use the search utilities described earlier in the notebook to find this node.
# You can also use filters.by_turn_label('Collect Appointment Data') to filter on information in the turn label.

# Cover the whole of January 2020 (UTC). The end bound is the last microsecond
# of Jan 31; an end bound of Jan 31 00:00:00 would leave out that entire day.
start_date = datetime.datetime(2020, 1, 1, 0, 0, 0, 0, pytz.UTC)
end_date = datetime.datetime(2020, 1, 31, 23, 59, 59, 999999, pytz.UTC)
filters.by_date_range(start_date, end_date)
filters.printConversationFilters()

# Get a reference to the dataframe. Note: you can get access to intermediate dataframes by calling getDataFrame(index).
filtered_df = filters.getDataFrame()
print("number of unique conversations in filtered dataframe: {}".format(len(filtered_df["conversation_id"].unique())))


# Collect the logs of conversations that continue to successful completion.
scheduling_completed_filter = filtering.ChainFilter(df_logs_to_analyze).setDescription("Appointement Scheduling flow - Completed")

completed_journey_node_ids = (
    'node_21_1513047983871',  # started the Appointment Scheduling flow
    'node_3_1517200453933',   # passed through the "Branch selection" node
    'node_43_1513049260736',  # reached the completion node of the Scheduling Appointment flow
)
for dialog_node_id in completed_journey_node_ids:
    scheduling_completed_filter.by_dialog_node_id(dialog_node_id)
scheduling_completed_filter.printConversationFilters()


# Collect the non-empty user utterances of those conversations.
scheduling_completed_df = scheduling_completed_filter.getDataFrame()
has_request_text = scheduling_completed_df["request_text"] != ""
all_utterances_completed = scheduling_completed_df.loc[has_request_text, "request_text"].tolist()

utterance_count = len(all_utterances_completed)
journey_count = len(scheduling_completed_df["conversation_id"].unique())
print(f"Gathered {utterance_count} utterances from {journey_count} successful journeys")


# Compare keywords of abandoned vs. completed conversations.
num_keywords = 25
custom_stop_words = "would pm ok yes no thank thanks hi i you".split()
data = keyword_analysis.get_data_for_comparison_visual(
    all_utterances_abandoned,
    all_utterances_completed,
    num_keywords,
    custom_stop_words,
)

# Draw the comparison as a word-pack chart.
config = dict(debugger=True, flattened=True, width=800, height=600)
visualization.draw_wordpackchart(config, data)


# NOTE: these exports use `project`, which is only defined if the project
# token lines earlier in this file have been uncommented and filled in.

# Rows of the dropped-off conversations that have a non-empty user utterance.
abandoned_with_text = dropped_off_conversations[dropped_off_conversations["request_text"] != ""]

# Export all user utterances at the drop-off point of the flow visualization selection.
abandoned_utterances_csv = abandoned_with_text.to_csv(
    columns=["request_text"], index=False, header=False)
project.save_data("abandoned-user-utterances.csv", abandoned_utterances_csv)

# Export the conversation ids of the dropped-off conversations.
abandoned_ids_csv = dropped_off_conversations.to_csv(
    columns=["conversation_id"], index=False, header=False)
project.save_data("abandoned-conversation-ids.csv", abandoned_ids_csv)

# Export all columns of the canonical model for the abandoned conversations.
project.save_data("abandoned-logs.csv", dropped_off_conversations.to_csv(index=False))

# Export a specific conversation, e.g. 00KjvlWcGozRTcSYTrlGqj4JYtYH5gjbvw3j
conversation_id_to_export = '00KjvlWcGozRTcSYTrlGqj4JYtYH5gjbvw3j'
is_exported_conversation = df_logs_to_analyze["conversation_id"] == conversation_id_to_export
project.save_data(
    conversation_id_to_export + ".csv",
    df_logs_to_analyze[is_exported_conversation].to_csv(index=False),
)

# Export user utterances for intent training with Watson Recommends.
from conversation_analytics_toolkit import export
sentences = abandoned_with_text.reset_index()[["request_text"]]
sentences.columns = ['example']
filtered_sentences = export.filter_sentences(sentences, min_complexity=3, max_complexity=20)
df_sentences = pd.DataFrame(data={"training_examples": filtered_sentences})

project.save_data(
    "utterances-for-Watson-Intent-Recommendations.csv",
    df_sentences.to_csv(sep=',', index=False, header=False),
)

Dialog Flow Analysis for Watson Assistant¶

Introduction¶

Prerequisites¶

Table of contents¶

1. Configuration and Setup¶

Install required Python libraries¶

1. Import required modules¶

2. Configure the notebook¶

3. Add the project token¶

2. Load Assistant Skills and Logs¶

2.1 Load from a Watson Assistant instance¶

2.1.1 Add Watson Assistant configuration¶

2.1.2 Fetch and load a workspace¶

2.2 Load option one: from JSON files¶

2.3 Load option two: from IBM Cloud Object Storage (using Watson Studio)¶

2.4 Load option three: from custom location¶

3. Extract and Transform¶

Step 1: Prepare skills¶

Step 2: Extract and Transform¶

4. Visualizing user journeys and abandonments¶

1. Visualize all conversations¶

2. Visualize a subset of conversations or conversation steps¶

4.2 Visualize dialog flow (milestone-based)¶

1. Define milestones¶

2. Derive a new dataset, using enrichment & filtering¶

3. Aggregate and visualize¶

4.3 Select conversations at the point of abandonment¶

5. Analyzing abandoned conversations¶

5.1 Explore conversation transcripts for qualitative analysis¶

Optionally enrich with sentiment information¶

5.2 Identify key words and phrases at point of abandonment¶

Gather utterances from abandoned conversations¶

5.2.1 Summarize frequent keywords and phrases¶

6. Measuring high level tasks of the Assistant¶

1. Define tasks¶

2. Measure task volume and completion rates¶

7. Advanced Topics¶

7.1 Locating important dialog nodes in your assistant¶

7.1.2 Searching programmatically¶

7.1.3 Interactive search and exploration¶

7.2 Filtering¶

7.3 Advanced keyword analysis: Comparing abandoned vs. successful conversations¶

Gather utterances from all conversations that completed the journey on the same flow¶

Outcome analysis: all utterances prior to abandonment vs completed¶

8. Summary and next steps¶

Here are a few useful exports you can use to support the above activities¶

Authors¶

Acknowledgement¶

3. Add the project token ¶

Authors ¶

Acknowledgement ¶