In [1]:
import pandas as pd
import umap      # get umap by installing umap-learn
import umap.plot # requires datashader, bokeh, holoviews
from bokeh.plotting import show, save, output_notebook, output_file

Introduction

In this notebook, we will try to visualize occupations based on O-Net data for work interests (RIASEC), work styles and work values.

In [2]:
jobID = 'O*NET-SOC Code'
multicol_names = ['Inventory', 'Constructs']


def multicols(df, inventory):
    """Return two-level column labels tagging every column of ``df`` with ``inventory``.

    The first level holds ``inventory`` repeated for each column; the second
    level holds the existing column labels of ``df`` in their original order.
    The levels are named according to ``multicol_names``.
    """
    pairs = []
    for construct in df.columns:
        pairs.append((inventory, construct))
    return pd.MultiIndex.from_tuples(pairs, names=multicol_names)
In [4]:
#%% Occupation titles and descriptions
titles_descriptions = pd.read_csv('descriptions.csv')

# Lookup table: occupation code (index) -> occupation title (single column).
jobs = titles_descriptions.set_index(jobID)[['Title']]

Plotting by RIASEC interests

In [20]:
#%% Interests
riasec_long = pd.read_csv('Interests.txt', sep='\t')

# Reshape to one row per occupation code and one column per element name.
# pivot_table aggregates with the mean by default if a (code, element) pair repeats.
riasec_wide = riasec_long.pivot_table(
    values='Data Value',
    index=[jobID],
    columns=['Element Name'],
)

# Keep only the six RIASEC score columns, in R-I-A-S-E-C order.
interests = [
    'Realistic', 'Investigative', 'Artistic',
    'Social', 'Enterprising', 'Conventional',
]
riasec = riasec_wide[interests]
riasec.columns = multicols(riasec, 'Interests')

# Replace occupation codes on the index with their titles and label the index.
riasec = riasec.rename(index=dict(jobs['Title'])).rename_axis('Title')

riasec
Out[20]:
Inventory Interests
Constructs Realistic Investigative Artistic Social Enterprising Conventional
Title
Chief Executives 1.33 2.00 2.67 3.67 7.00 5.33
Chief Sustainability Officers 1.00 4.33 2.67 2.33 7.00 4.33
General and Operations Managers 1.33 1.33 1.00 3.33 7.00 3.67
Legislators 1.00 3.67 3.67 4.67 7.00 3.00
Advertising and Promotions Managers 1.67 2.00 5.33 2.33 7.00 4.67
... ... ... ... ... ... ...
Pump Operators, Except Wellhead Pumpers 7.00 4.00 1.00 1.33 2.00 4.67
Wellhead Pumpers 7.00 3.67 1.00 1.00 1.33 5.00
Refuse and Recyclable Material Collectors 7.00 1.33 1.00 1.00 2.33 3.67
Mine Shuttle Car Operators 7.00 1.33 1.00 1.00 1.67 2.33
Tank Car, Truck, and Ship Loaders 7.00 3.00 1.00 1.33 2.00 5.00

974 rows × 6 columns

In [21]:
df = riasec

# One hover label per row, in the same order as the rows handed to UMAP.
hover_data = pd.DataFrame({'Title': df.index.tolist()})

# UMAP is stochastic: without a fixed seed the 2-D layout differs on every run,
# so the figure could not be reproduced after a kernel restart.
mapper = umap.UMAP(random_state=42).fit(df)

plot = umap.plot.interactive(mapper, hover_data=hover_data)
output_notebook()
umap.plot.show(plot)
Loading BokehJS ...

This is quite interesting. The occupations are not distributed evenly among RIASEC scores, and there are some very clear clusters. The clustering is also quite intuitive.

Plotting by Work Styles

In [7]:
#%% Styles
styles_long = pd.read_csv('Work Styles.txt', sep = '\t')

# Reshape to one row per occupation code and one column per element name.
styles_wide = pd.pivot_table(styles_long, 
                        values = 'Data Value',  
                        index = [jobID], 
                        columns=['Element Name'])

# Work on a copy: assigning new column labels through a plain alias would
# also relabel styles_wide, leaving it without its flat element-name columns.
styles = styles_wide.copy()
styles.columns = multicols(styles,'Styles')

# Replace occupation codes on the index with their titles.
styles = styles.rename(index = dict(jobs['Title']))
styles.index.names = ['Title']

styles
Out[7]:
Inventory Styles
Constructs Achievement/Effort Adaptability/Flexibility Analytical Thinking Attention to Detail Concern for Others Cooperation Dependability Independence Initiative Innovation Integrity Leadership Persistence Self Control Social Orientation Stress Tolerance
Title
Chief Executives 4.55 4.20 4.45 4.49 4.22 4.44 4.65 4.54 4.75 4.27 4.80 4.97 4.43 4.52 3.67 4.75
Chief Sustainability Officers 4.19 4.23 4.31 4.12 3.48 4.32 4.23 4.27 4.60 4.38 4.58 4.64 4.31 4.00 3.35 4.08
General and Operations Managers 4.18 4.09 4.03 4.25 4.06 4.16 4.54 3.99 4.35 3.65 4.30 4.59 4.04 4.31 3.57 4.33
Advertising and Promotions Managers 4.10 4.42 3.88 4.48 3.81 4.51 4.51 4.04 4.56 3.99 4.17 4.03 4.32 4.18 3.70 4.35
Marketing Managers 4.32 4.31 4.00 4.32 3.37 4.08 4.35 3.88 4.53 4.15 4.17 4.37 4.23 3.87 3.80 4.01
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Pump Operators, Except Wellhead Pumpers 2.81 3.97 3.19 3.96 3.95 4.03 4.27 3.19 3.50 2.40 3.55 3.58 2.63 3.99 3.22 4.51
Wellhead Pumpers 3.89 3.89 3.71 4.32 3.99 4.03 4.64 4.06 4.23 3.49 4.27 3.54 3.86 4.15 2.90 3.95
Refuse and Recyclable Material Collectors 3.78 4.01 3.37 4.08 4.06 4.08 4.64 4.25 3.76 3.56 4.29 3.77 3.98 4.45 2.86 4.34
Mine Shuttle Car Operators 3.27 3.14 2.45 2.92 3.34 3.43 3.64 3.20 3.29 2.54 3.18 2.27 3.30 3.41 3.00 3.37
Tank Car, Truck, and Ship Loaders 3.50 3.94 3.53 4.37 3.89 4.14 4.48 3.77 3.66 3.22 3.95 3.39 3.45 3.81 3.40 3.69

967 rows × 16 columns

In [19]:
df = styles

# One hover label per row, in the same order as the rows handed to UMAP.
hover_data = pd.DataFrame({'Title': df.index.tolist()})

# UMAP is stochastic: fix the seed so the layout (and therefore which
# occupations land at which corner of the plot) is the same on every run.
mapper = umap.UMAP(random_state=42).fit(df)

plot = umap.plot.interactive(mapper, hover_data=hover_data)
output_notebook()
umap.plot.show(plot)
Loading BokehJS ...

The occupations are spread much more evenly than was the case with work interests.

The three "apexes" of the triangle are:

  • Top: quantitative/creative? (Poets are near the top, along with Mathematicians, for example)
  • Bottom left: medical/educational
  • Bottom right: factory work

Plotting by Work Values

In [9]:
#%% Values
values_long = pd.read_csv('Work Values.txt', sep='\t')

# Reshape to one row per occupation code and one column per element name.
# pivot_table aggregates with the mean by default if a (code, element) pair repeats.
values_wide = values_long.pivot_table(
    values='Data Value',
    index=[jobID],
    columns=['Element Name'],
)

# Keep only the six work-value score columns.
values = values_wide[[
    'Achievement', 'Independence', 'Recognition',
    'Relationships', 'Support', 'Working Conditions',
]]
values.columns = multicols(values, 'Values')

# Replace occupation codes on the index with their titles and label the index.
values = values.rename(index=dict(jobs['Title'])).rename_axis('Title')

values
Out[9]:
Inventory Values
Constructs Achievement Independence Recognition Relationships Support Working Conditions
Title
Chief Executives 6.33 7.00 7.00 5.00 5.33 6.33
Chief Sustainability Officers 6.67 6.67 6.00 5.00 3.33 6.33
General and Operations Managers 5.33 6.00 5.67 6.33 4.67 6.00
Legislators 5.33 5.00 5.00 5.67 4.00 4.33
Advertising and Promotions Managers 5.33 5.33 5.33 5.00 4.00 5.33
... ... ... ... ... ... ...
Pump Operators, Except Wellhead Pumpers 2.33 3.67 2.33 3.67 5.67 3.17
Wellhead Pumpers 2.33 3.00 2.33 4.33 5.67 3.17
Refuse and Recyclable Material Collectors 2.00 2.33 1.33 4.00 4.33 2.83
Mine Shuttle Car Operators 2.00 2.67 2.00 3.00 5.67 2.83
Tank Car, Truck, and Ship Loaders 2.67 5.00 2.33 4.67 6.00 3.00

974 rows × 6 columns

In [10]:
df = values

# One hover label per row, in the same order as the rows handed to UMAP.
hover_data = pd.DataFrame({'Title': df.index.tolist()})

# UMAP is stochastic: fix the seed so the layout (and therefore which
# occupations land at which corner of the plot) is the same on every run.
mapper = umap.UMAP(random_state=42).fit(df)

plot = umap.plot.interactive(mapper, hover_data=hover_data)
output_notebook()
umap.plot.show(plot)
Loading BokehJS ...

Again, quite uniformly spread, like the plot for work styles.

This time, the apexes of the triangle are:

  • Top: medical and managerial
  • Bottom left: engineering
  • Bottom right: cleaners, mail sorters etc.

Plotting by all three combined

In [22]:
# Combine the three inventories on the shared Title index; occupations missing
# from any one inventory end up with NaNs after the joins and are dropped.
df =  riasec.join(styles).join(values).dropna()

# One hover label per row, in the same order as the rows handed to UMAP.
hover_data = pd.DataFrame({'Title': df.index.tolist()})

# UMAP is stochastic: fix the seed so the layout is the same on every run.
mapper = umap.UMAP(random_state=42).fit(df)

plot = umap.plot.interactive(mapper, hover_data=hover_data)
output_notebook()
umap.plot.show(plot)
Loading BokehJS ...

A mix of all three. This has clusters similar to those in the work interests plot, but also shows some influence from work styles and work values. Quite an interesting visualization, which does seem to agree with intuition, for the most part.

Further directions

The above visualizations use the Euclidean metric for deciding how close vocations should be to each other (based on interests, styles and values). It might be interesting trying alternative metrics.

It would also be nice to be able to hover over a node, and immediately get edges connecting it to the top 10 nearest neighbours, say. These nearest neighbours may not be the nearest ones visually, since the projection does distort some distances.

As an application, we could get users to do the tests for work interests, styles and values, and then plot the user as a point in the above space, so the user can see where they land, and which jobs are closest to them.

It would also be useful to colour the nodes somehow, but we need to choose a meaningful measure to colour them by. One possibility is to use what O-Net calls "Job Zones". There are 5 job zones, indicating how much preparation/expertise is required for the job, with 1 being little preparation required, and 5 being years of study or experience.

We also have not incorporated skills or job descriptions yet.