Compute l-diversity with Cloud DLP. L-diversity, which is an extension of k-anonymity, measures the diversity of sensitive values for each column in which they occur. A dataset has l-diversity if, for every set of rows with identical quasi-identifiers, there are at least l distinct values for each sensitive attribute.
Explore further
For detailed documentation that includes this code sample, see the following:
Code sample
C#
To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.
To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
using
Google.Api.Gax.ResourceNames
;
using
Google.Cloud.Dlp.V2
;
using
Google.Cloud.PubSub.V1
;
using
Newtonsoft.Json
;
using
System
;
using
System.Collections.Generic
;
using
System.Linq
;
using
System.Threading
;
using
System.Threading.Tasks
;
using
static
Google
.
Cloud
.
Dlp
.
V2
.
Action
.
Types
;
using
static
Google
.
Cloud
.
Dlp
.
V2
.
PrivacyMetric
.
Types
;
/// <summary>
/// Sample that runs a Cloud DLP l-diversity risk analysis job over a BigQuery table
/// and prints the resulting histogram to the console.
/// </summary>
public class RiskAnalysisCreateLDiversity
{
    /// <summary>
    /// Submits an l-diversity risk analysis job, waits for the Pub/Sub notification
    /// carrying the job's name, then fetches the job and prints its l-diversity histogram.
    /// </summary>
    /// <param name="callingProjectId">Project used as the job parent and as the owner of the topic and subscription.</param>
    /// <param name="tableProjectId">Project that contains the BigQuery table.</param>
    /// <param name="datasetId">BigQuery dataset ID.</param>
    /// <param name="tableId">BigQuery table ID.</param>
    /// <param name="topicId">Pub/Sub topic the job publishes its notification to.</param>
    /// <param name="subscriptionId">Pub/Sub subscription this method listens on.</param>
    /// <param name="quasiIds">Fields that together form the quasi-identifier.</param>
    /// <param name="sensitiveAttribute">Name of the column whose diversity is measured.</param>
    /// <returns>The l-diversity result read from the fetched job's risk details.</returns>
    /// <exception cref="TimeoutException">
    /// No notification for the submitted job arrived within 10 minutes.
    /// </exception>
    public static object LDiversity(
        string callingProjectId,
        string tableProjectId,
        string datasetId,
        string tableId,
        string topicId,
        string subscriptionId,
        IEnumerable<FieldId> quasiIds,
        string sensitiveAttribute)
    {
        var dlp = DlpServiceClient.Create();

        // Construct + submit the job
        var ldiversityConfig = new LDiversityConfig
        {
            SensitiveAttribute = new FieldId { Name = sensitiveAttribute },
            QuasiIds = { quasiIds }
        };

        var config = new RiskAnalysisJobConfig
        {
            PrivacyMetric = new PrivacyMetric
            {
                LDiversityConfig = ldiversityConfig
            },
            SourceTable = new BigQueryTable
            {
                ProjectId = tableProjectId,
                DatasetId = datasetId,
                TableId = tableId
            },
            Actions =
            {
                new Google.Cloud.Dlp.V2.Action
                {
                    PubSub = new PublishToPubSub
                    {
                        Topic = $"projects/{callingProjectId}/topics/{topicId}"
                    }
                }
            }
        };

        var submittedJob = dlp.CreateDlpJob(
            new CreateDlpJobRequest
            {
                ParentAsProjectName = new ProjectName(callingProjectId),
                RiskJob = config
            });

        // Listen to pub/sub for the job
        var subscriptionName = new SubscriptionName(callingProjectId, subscriptionId);
        var subscriber = SubscriberClient.CreateAsync(subscriptionName).Result;

        // The subscriber may run the message handler on multiple threads,
        // so completion is signalled through a thread-safe event.
        var done = new ManualResetEventSlim(false);
        subscriber.StartAsync(async (PubsubMessage message, CancellationToken cancel) =>
        {
            // TryGetValue instead of the indexer: a message without a DlpJobName
            // attribute would otherwise throw inside the handler.
            string notifiedJobName;
            if (!message.Attributes.TryGetValue("DlpJobName", out notifiedJobName)
                || notifiedJobName != submittedJob.Name)
            {
                return SubscriberClient.Reply.Nack;
            }

            // Wait for DLP API results to become consistent (without blocking
            // the handler thread the way Thread.Sleep would).
            await Task.Delay(500, cancel);
            done.Set();
            return SubscriberClient.Reply.Ack;
        });

        // 10 minute timeout; may not work for large jobs
        var completed = done.Wait(TimeSpan.FromMinutes(10));
        subscriber.StopAsync(CancellationToken.None).Wait();

        if (!completed)
        {
            // Without this check a timeout would fall through and read results
            // from a job that has not reported completion.
            throw new TimeoutException(
                $"No completion notification for DLP job {submittedJob.Name} was received within 10 minutes.");
        }

        // Process results
        var resultJob = dlp.GetDlpJob(
            new GetDlpJobRequest
            {
                DlpJobName = DlpJobName.Parse(submittedJob.Name)
            });

        var result = resultJob.RiskDetails.LDiversityResult;

        for (var bucketIdx = 0; bucketIdx < result.SensitiveValueFrequencyHistogramBuckets.Count; bucketIdx++)
        {
            var bucket = result.SensitiveValueFrequencyHistogramBuckets[bucketIdx];
            Console.WriteLine($"Bucket {bucketIdx}");
            Console.WriteLine($" Bucket size range: [{bucket.SensitiveValueFrequencyLowerBound}, {bucket.SensitiveValueFrequencyUpperBound}].");
            Console.WriteLine($" {bucket.BucketSize} unique value(s) total.");

            foreach (var bucketValue in bucket.BucketValues)
            {
                // 'UnpackValue(x)' is a prettier version of 'x.toString()'
                Console.WriteLine($" Quasi-ID values: [{String.Join(',', bucketValue.QuasiIdsValues.Select(x => UnpackValue(x)))}]");
                Console.WriteLine($" Class size: {bucketValue.EquivalenceClassSize}");

                foreach (var topValue in bucketValue.TopSensitiveValues)
                {
                    Console.WriteLine($" Sensitive value {UnpackValue(topValue.Value)} occurs {topValue.Count} time(s).");
                }
            }
        }

        return result;
    }

    /// <summary>
    /// Renders a DLP <c>Value</c> as plain text by parsing its <c>ToString()</c> output
    /// as a JSON object and returning the first entry's value.
    /// </summary>
    /// <param name="protoValue">The value to render.</param>
    /// <returns>
    /// The first entry's value as a string, or an empty string when
    /// <paramref name="protoValue"/> is null or the parsed object has no entries.
    /// </returns>
    public static string UnpackValue(Value protoValue)
    {
        if (protoValue == null)
        {
            return string.Empty;
        }

        var jsonValue = JsonConvert.DeserializeObject<Dictionary<string, object>>(protoValue.ToString());

        // FirstOrDefault instead of ElementAt(0): a Value with no field set
        // parses to an empty dictionary, on which ElementAt(0) throws.
        return jsonValue?.Values.FirstOrDefault()?.ToString() ?? string.Empty;
    }
}
Go
To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.
To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
import
(
"context"
"fmt"
"io"
"strings"
"time"
dlp
"cloud.google.com/go/dlp/apiv2"
"cloud.google.com/go/dlp/apiv2/dlppb"
"cloud.google.com/go/pubsub"
)
// riskLDiversity computes the L Diversity of the given columns.
func
riskLDiversity
(
w
io
.
Writer
,
projectID
,
dataProject
,
pubSubTopic
,
pubSubSub
,
datasetID
,
tableID
,
sensitiveAttribute
string
,
columnNames
...
string
)
error
{
// projectID := "my-project-id"
// dataProject := "bigquery-public-data"
// pubSubTopic := "dlp-risk-sample-topic"
// pubSubSub := "dlp-risk-sample-sub"
// datasetID := "nhtsa_traffic_fatalities"
// tableID := "accident_2015"
// sensitiveAttribute := "city"
// columnNames := "state_number", "county"
ctx
:=
context
.
Background
()
client
,
err
:=
dlp
.
NewClient
(
ctx
)
if
err
!=
nil
{
return
fmt
.
Errorf
(
"dlp.NewClient: %w"
,
err
)
}
defer
client
.
Close
()
// Create a PubSub Client used to listen for when the inspect job finishes.
pubsubClient
,
err
:=
pubsub
.
NewClient
(
ctx
,
projectID
)
if
err
!=
nil
{
return
err
}
defer
pubsubClient
.
Close
()
// Create a PubSub subscription we can use to listen for messages.
// Create the Topic if it doesn't exist.
t
:=
pubsubClient
.
Topic
(
pubSubTopic
)
topicExists
,
err
:=
t
.
Exists
(
ctx
)
if
err
!=
nil
{
return
err
}
if
!
topicExists
{
if
t
,
err
=
pubsubClient
.
CreateTopic
(
ctx
,
pubSubTopic
);
err
!=
nil
{
return
err
}
}
// Create the Subscription if it doesn't exist.
s
:=
pubsubClient
.
Subscription
(
pubSubSub
)
subExists
,
err
:=
s
.
Exists
(
ctx
)
if
err
!=
nil
{
return
err
}
if
!
subExists
{
if
s
,
err
=
pubsubClient
.
CreateSubscription
(
ctx
,
pubSubSub
,
pubsub
.
SubscriptionConfig
{
Topic
:
t
});
err
!=
nil
{
return
err
}
}
// topic is the PubSub topic string where messages should be sent.
topic
:=
"projects/"
+
projectID
+
"/topics/"
+
pubSubTopic
// Build the QuasiID slice.
var
q
[]
*
dlppb
.
FieldId
for
_
,
c
:=
range
columnNames
{
q
=
append
(
q
,
& dlppb
.
FieldId
{
Name
:
c
})
}
// Create a configured request.
req
:=
& dlppb
.
CreateDlpJobRequest
{
Parent
:
fmt
.
Sprintf
(
"projects/%s/locations/global"
,
projectID
),
Job
:
& dlppb
.
CreateDlpJobRequest_RiskJob
{
RiskJob
:
& dlppb
.
RiskAnalysisJobConfig
{
// PrivacyMetric configures what to compute.
PrivacyMetric
:
& dlppb
.
PrivacyMetric
{
Type
:
& dlppb
.
PrivacyMetric_LDiversityConfig_
{
LDiversityConfig
:
& dlppb
.
PrivacyMetric_LDiversityConfig
{
QuasiIds
:
q
,
SensitiveAttribute
:
& dlppb
.
FieldId
{
Name
:
sensitiveAttribute
,
},
},
},
},
// SourceTable describes where to find the data.
SourceTable
:
& dlppb
.
BigQueryTable
{
ProjectId
:
dataProject
,
DatasetId
:
datasetID
,
TableId
:
tableID
,
},
// Send a message to PubSub using Actions.
Actions
:
[]
*
dlppb
.
Action
{
{
Action
:
& dlppb
.
Action_PubSub
{
PubSub
:
& dlppb
.
Action_PublishToPubSub
{
Topic
:
topic
,
},
},
},
},
},
},
}
// Create the risk job.
j
,
err
:=
client
.
CreateDlpJob
(
ctx
,
req
)
if
err
!=
nil
{
return
fmt
.
Errorf
(
"CreateDlpJob: %w"
,
err
)
}
fmt
.
Fprintf
(
w
,
"Created job: %v\n"
,
j
.
GetName
())
// Wait for the risk job to finish by waiting for a PubSub message.
// This only waits for 10 minutes. For long jobs, consider using a truly
// asynchronous execution model such as Cloud Functions.
ctx
,
cancel
:=
context
.
WithTimeout
(
ctx
,
10
*
time
.
Minute
)
defer
cancel
()
err
=
s
.
Receive
(
ctx
,
func
(
ctx
context
.
Context
,
msg
*
pubsub
.
Message
)
{
// If this is the wrong job, do not process the result.
if
msg
.
Attributes
[
"DlpJobName"
]
!=
j
.
GetName
()
{
msg
.
Nack
()
return
}
msg
.
Ack
()
time
.
Sleep
(
500
*
time
.
Millisecond
)
j
,
err
:=
client
.
GetDlpJob
(
ctx
,
& dlppb
.
GetDlpJobRequest
{
Name
:
j
.
GetName
(),
})
if
err
!=
nil
{
fmt
.
Fprintf
(
w
,
"GetDlpJob: %v"
,
err
)
return
}
h
:=
j
.
GetRiskDetails
().
GetLDiversityResult
().
GetSensitiveValueFrequencyHistogramBuckets
()
for
i
,
b
:=
range
h
{
fmt
.
Fprintf
(
w
,
"Histogram bucket %v\n"
,
i
)
fmt
.
Fprintf
(
w
,
" Size range: [%v,%v]\n"
,
b
.
GetSensitiveValueFrequencyLowerBound
(),
b
.
GetSensitiveValueFrequencyUpperBound
())
fmt
.
Fprintf
(
w
,
" %v unique values total\n"
,
b
.
GetBucketSize
())
for
_
,
v
:=
range
b
.
GetBucketValues
()
{
var
qvs
[]
string
for
_
,
qv
:=
range
v
.
GetQuasiIdsValues
()
{
qvs
=
append
(
qvs
,
qv
.
String
())
}
fmt
.
Fprintf
(
w
,
" QuasiID values: %s\n"
,
strings
.
Join
(
qvs
,
", "
))
fmt
.
Fprintf
(
w
,
" Class size: %v\n"
,
v
.
GetEquivalenceClassSize
())
for
_
,
sv
:=
range
v
.
GetTopSensitiveValues
()
{
fmt
.
Fprintf
(
w
,
" Sensitive value %v occurs %v times\n"
,
sv
.
GetValue
(),
sv
.
GetCount
())
}
}
}
// Stop listening for more messages.
cancel
()
})
if
err
!=
nil
{
return
fmt
.
Errorf
(
"Recieve: %w"
,
err
)
}
return
nil
}
Java
To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.
To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
import
com.google.api.core. SettableApiFuture
;
import
com.google.cloud.dlp.v2. DlpServiceClient
;
import
com.google.cloud.dlp.v2. DlpServiceSettings
;
import
com.google.cloud.pubsub.v1. AckReplyConsumer
;
import
com.google.cloud.pubsub.v1. MessageReceiver
;
import
com.google.cloud.pubsub.v1. Subscriber
;
import
com.google.privacy.dlp.v2. Action
;
import
com.google.privacy.dlp.v2. Action
. PublishToPubSub
;
import
com.google.privacy.dlp.v2. AnalyzeDataSourceRiskDetails
. LDiversityResult
;
import
com.google.privacy.dlp.v2. AnalyzeDataSourceRiskDetails
. LDiversityResult
. LDiversityEquivalenceClass
;
import
com.google.privacy.dlp.v2. AnalyzeDataSourceRiskDetails
. LDiversityResult
. LDiversityHistogramBucket
;
import
com.google.privacy.dlp.v2. BigQueryTable
;
import
com.google.privacy.dlp.v2. CreateDlpJobRequest
;
import
com.google.privacy.dlp.v2. DlpJob
;
import
com.google.privacy.dlp.v2. FieldId
;
import
com.google.privacy.dlp.v2. GetDlpJobRequest
;
import
com.google.privacy.dlp.v2. LocationName
;
import
com.google.privacy.dlp.v2. PrivacyMetric
;
import
com.google.privacy.dlp.v2. PrivacyMetric
. LDiversityConfig
;
import
com.google.privacy.dlp.v2. RiskAnalysisJobConfig
;
import
com.google.privacy.dlp.v2. Value
;
import
com.google.privacy.dlp.v2. ValueFrequency
;
import
com.google.pubsub.v1. ProjectSubscriptionName
;
import
com.google.pubsub.v1. ProjectTopicName
;
import
com.google.pubsub.v1. PubsubMessage
;
import
java.io.IOException
;
import
java.util.Arrays
;
import
java.util.List
;
import
java.util.concurrent.ExecutionException
;
import
java.util.concurrent.TimeUnit
;
import
java.util.concurrent.TimeoutException
;
import
java.util.stream.Collectors
;
import
org.threeten.bp.Duration
;
@SuppressWarnings
(
"checkstyle:AbbreviationAsWordInName"
)
class
RiskAnalysisLDiversity
{
public
static
void
main
(
String
[]
args
)
throws
Exception
{
// TODO(developer): Replace these variables before running the sample.
String
projectId
=
"your-project-id"
;
String
datasetId
=
"your-bigquery-dataset-id"
;
String
tableId
=
"your-bigquery-table-id"
;
String
topicId
=
"pub-sub-topic"
;
String
subscriptionId
=
"pub-sub-subscription"
;
calculateLDiversity
(
projectId
,
datasetId
,
tableId
,
topicId
,
subscriptionId
);
}
public
static
void
calculateLDiversity
(
String
projectId
,
String
datasetId
,
String
tableId
,
String
topicId
,
String
subscriptionId
)
throws
ExecutionException
,
InterruptedException
,
IOException
{
// Initialize client that will be used to send requests. This client only needs to be created
// once, and can be reused for multiple requests. After completing all of your requests, call
// the "close" method on the client to safely clean up any remaining background resources.
DlpServiceSettings
.
Builder
dlpServiceSettingsBuilder
=
DlpServiceSettings
.
newBuilder
();
dlpServiceSettingsBuilder
.
getDlpJobSettings
()
.
setRetrySettings
(
dlpServiceSettingsBuilder
.
getDlpJobSettings
()
.
getRetrySettings
()
.
toBuilder
()
.
setTotalTimeout
(
Duration
.
ofSeconds
(
600
))
.
build
());
try
(
DlpServiceClient
dlpServiceClient
=
DlpServiceClient
.
create
(
dlpServiceSettingsBuilder
.
build
()))
{
// Specify the BigQuery table to analyze
BigQueryTable
bigQueryTable
=
BigQueryTable
.
newBuilder
()
.
setProjectId
(
projectId
)
.
setDatasetId
(
datasetId
)
.
setTableId
(
tableId
)
.
build
();
// These values represent the column names of quasi-identifiers to analyze
List<String>
quasiIds
=
Arrays
.
asList
(
"Age"
,
"Mystery"
);
// This value represents the column name to compare the quasi-identifiers against
String
sensitiveAttribute
=
"Name"
;
// Configure the privacy metric for the job
FieldId
sensitiveAttributeField
=
FieldId
.
newBuilder
().
setName
(
sensitiveAttribute
).
build
();
List<FieldId>
quasiIdFields
=
quasiIds
.
stream
()
.
map
(
columnName
-
>
FieldId
.
newBuilder
().
setName
(
columnName
).
build
())
.
collect
(
Collectors
.
toList
());
LDiversityConfig
ldiversityConfig
=
LDiversityConfig
.
newBuilder
()
.
addAllQuasiIds
(
quasiIdFields
)
.
setSensitiveAttribute
(
sensitiveAttributeField
)
.
build
();
PrivacyMetric
privacyMetric
=
PrivacyMetric
.
newBuilder
().
setLDiversityConfig
(
ldiversityConfig
).
build
();
// Create action to publish job status notifications over Google Cloud Pub/
ProjectTopicName
topicName
=
ProjectTopicName
.
of
(
projectId
,
topicId
);
PublishToPubSub
publishToPubSub
=
PublishToPubSub
.
newBuilder
().
setTopic
(
topicName
.
toString
()).
build
();
Action
action
=
Action
.
newBuilder
().
setPubSub
(
publishToPubSub
).
build
();
// Configure the risk analysis job to perform
RiskAnalysisJobConfig
riskAnalysisJobConfig
=
RiskAnalysisJobConfig
.
newBuilder
()
.
setSourceTable
(
bigQueryTable
)
.
setPrivacyMetric
(
privacyMetric
)
.
addActions
(
action
)
.
build
();
// Build the request to be sent by the client
CreateDlpJobRequest
createDlpJobRequest
=
CreateDlpJobRequest
.
newBuilder
()
.
setParent
(
LocationName
.
of
(
projectId
,
"global"
).
toString
())
.
setRiskJob
(
riskAnalysisJobConfig
)
.
build
();
// Send the request to the API using the client
DlpJob
dlpJob
=
dlpServiceClient
.
createDlpJob
(
createDlpJobRequest
);
// Set up a Pub/Sub subscriber to listen on the job completion status
final
SettableApiFuture<Boolean>
done
=
SettableApiFuture
.
create
();
ProjectSubscriptionName
subscriptionName
=
ProjectSubscriptionName
.
of
(
projectId
,
subscriptionId
);
MessageReceiver
messageHandler
=
(
PubsubMessage
pubsubMessage
,
AckReplyConsumer
ackReplyConsumer
)
-
>
{
handleMessage
(
dlpJob
,
done
,
pubsubMessage
,
ackReplyConsumer
);
};
Subscriber
subscriber
=
Subscriber
.
newBuilder
(
subscriptionName
,
messageHandler
).
build
();
subscriber
.
startAsync
();
// Wait for job completion semi-synchronously
// For long jobs, consider using a truly asynchronous execution model such as Cloud Functions
try
{
done
.
get
(
15
,
TimeUnit
.
MINUTES
);
}
catch
(
TimeoutException
e
)
{
System
.
out
.
println
(
"Job was not completed after 15 minutes."
);
return
;
}
finally
{
subscriber
.
stopAsync
();
subscriber
.
awaitTerminated
();
}
// Build a request to get the completed job
GetDlpJobRequest
getDlpJobRequest
=
GetDlpJobRequest
.
newBuilder
().
setName
(
dlpJob
.
getName
()).
build
();
// Retrieve completed job status
DlpJob
completedJob
=
dlpServiceClient
.
getDlpJob
(
getDlpJobRequest
);
System
.
out
.
println
(
"Job status: "
+
completedJob
.
getState
());
System
.
out
.
println
(
"Job name: "
+
dlpJob
.
getName
());
// Get the result and parse through and process the information
LDiversityResult
ldiversityResult
=
completedJob
.
getRiskDetails
().
getLDiversityResult
();
List<LDiversityHistogramBucket>
histogramBucketList
=
ldiversityResult
.
getSensitiveValueFrequencyHistogramBucketsList
();
for
(
LDiversityHistogramBucket
result
:
histogramBucketList
)
{
for
(
LDiversityEquivalenceClass
bucket
:
result
.
getBucketValuesList
())
{
List<String>
quasiIdValues
=
bucket
.
getQuasiIdsValuesList
().
stream
()
.
map
(
Value
::
toString
)
.
collect
(
Collectors
.
toList
());
System
.
out
.
println
(
"\tQuasi-ID values: "
+
String
.
join
(
", "
,
quasiIdValues
));
System
.
out
.
println
(
"\tClass size: "
+
bucket
.
getEquivalenceClassSize
());
for
(
ValueFrequency
valueFrequency
:
bucket
.
getTopSensitiveValuesList
())
{
System
.
out
.
printf
(
"\t\tSensitive value %s occurs %d time(s).\n"
,
valueFrequency
.
getValue
().
toString
(),
valueFrequency
.
getCount
());
}
}
}
}
}
// handleMessage injects the job and settableFuture into the message reciever interface
private
static
void
handleMessage
(
DlpJob
job
,
SettableApiFuture<Boolean>
done
,
PubsubMessage
pubsubMessage
,
AckReplyConsumer
ackReplyConsumer
)
{
String
messageAttribute
=
pubsubMessage
.
getAttributesMap
().
get
(
"DlpJobName"
);
if
(
job
.
getName
().
equals
(
messageAttribute
))
{
done
.
set
(
true
);
ack
ReplyConsumer .
ack
();
}
else
{
ackReplyConsumer
.
nack
();
}
}
}
Node.js
To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.
To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
// Import the Google Cloud client libraries
const DLP = require('@google-cloud/dlp');
const {PubSub} = require('@google-cloud/pubsub');

// Instantiates clients
// dlp submits and fetches the risk job; pubsub is used to listen for its
// completion notification.
const dlp = new DLP.DlpServiceClient();
const pubsub = new PubSub();
// The project ID to run the API call under
// const projectId = 'my-project';
// The project ID the table is stored under
// This may or (for public datasets) may not equal the calling project ID
// const tableProjectId = 'my-project';
// The ID of the dataset to inspect, e.g. 'my_dataset'
// const datasetId = 'my_dataset';
// The ID of the table to inspect, e.g. 'my_table'
// const tableId = 'my_table';
// The name of the Pub/Sub topic to notify once the job completes
// TODO(developer): create a Pub/Sub topic to use for this
// const topicId = 'MY-PUBSUB-TOPIC'
// The name of the Pub/Sub subscription to use when listening for job
// completion notifications
// TODO(developer): create a Pub/Sub subscription to use for this
// const subscriptionId = 'MY-PUBSUB-SUBSCRIPTION'
// The column to measure l-diversity relative to, e.g. 'firstName'
// const sensitiveAttribute = 'name';
// A set of columns that form a composite key ('quasi-identifiers')
// const quasiIds = [{ name: 'age' }, { name: 'city' }];
/**
 * Submits an l-diversity risk analysis job for the configured BigQuery table,
 * waits for the Pub/Sub notification carrying the job's name, then fetches the
 * job and logs its l-diversity histogram.
 *
 * Reads projectId, tableProjectId, datasetId, tableId, topicId,
 * subscriptionId, sensitiveAttribute and quasiIds from the enclosing scope
 * (see the commented-out declarations above this function).
 */
async function lDiversityAnalysis() {
  const sourceTable = {
    projectId: tableProjectId,
    datasetId: datasetId,
    tableId: tableId,
  };

  // Construct request for creating a risk analysis job
  const request = {
    parent: `projects/${projectId}/locations/global`,
    riskJob: {
      privacyMetric: {
        lDiversityConfig: {
          quasiIds: quasiIds,
          sensitiveAttribute: {
            name: sensitiveAttribute,
          },
        },
      },
      sourceTable: sourceTable,
      actions: [
        {
          pubSub: {
            topic: `projects/${projectId}/topics/${topicId}`,
          },
        },
      ],
    },
  };

  // Create helper function for unpacking values:
  // returns the value stored under the object's first key.
  const getValue = obj => obj[Object.keys(obj)[0]];

  // Run risk analysis job
  const [topicResponse] = await pubsub.topic(topicId).get();
  const subscription = topicResponse.subscription(subscriptionId);
  const [jobsResponse] = await dlp.createDlpJob(request);
  const jobName = jobsResponse.name;
  console.log(`Job created. Job name: ${jobName}`);

  // Watch the Pub/Sub topic until the DLP job finishes
  await new Promise((resolve, reject) => {
    const messageHandler = message => {
      if (message.attributes && message.attributes.DlpJobName === jobName) {
        message.ack();
        subscription.removeListener('message', messageHandler);
        subscription.removeListener('error', errorHandler);
        resolve(jobName);
      } else {
        message.nack();
      }
    };

    const errorHandler = err => {
      subscription.removeListener('message', messageHandler);
      subscription.removeListener('error', errorHandler);
      reject(err);
    };

    subscription.on('message', messageHandler);
    subscription.on('error', errorHandler);
  });

  // Actually pause before fetching the job: a bare setTimeout callback only
  // schedules the log line and does not delay the getDlpJob call below.
  console.log(' Waiting for DLP job to fully complete');
  await new Promise(resolve => setTimeout(resolve, 500));

  const [job] = await dlp.getDlpJob({name: jobName});
  const histogramBuckets =
    job.riskDetails.lDiversityResult.sensitiveValueFrequencyHistogramBuckets;

  histogramBuckets.forEach((histogramBucket, histogramBucketIdx) => {
    console.log(`Bucket ${histogramBucketIdx}:`);

    console.log(
      `Bucket size range: [${histogramBucket.sensitiveValueFrequencyLowerBound}, ${histogramBucket.sensitiveValueFrequencyUpperBound}]`
    );
    histogramBucket.bucketValues.forEach(valueBucket => {
      const quasiIdValues = valueBucket.quasiIdsValues
        .map(getValue)
        .join(', ');
      console.log(` Quasi-ID values: {${quasiIdValues}}`);
      console.log(` Class size: ${valueBucket.equivalenceClassSize}`);
      valueBucket.topSensitiveValues.forEach(valueObj => {
        console.log(
          ` Sensitive value ${getValue(valueObj.value)} occurs ${valueObj.count} time(s).`
        );
      });
    });
  });
}

await lDiversityAnalysis();
PHP
To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.
To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
use Google\Cloud\Dlp\V2\Action;
use Google\Cloud\Dlp\V2\Action\PublishToPubSub;
use Google\Cloud\Dlp\V2\BigQueryTable;
use Google\Cloud\Dlp\V2\Client\DlpServiceClient;
use Google\Cloud\Dlp\V2\CreateDlpJobRequest;
use Google\Cloud\Dlp\V2\DlpJob\JobState;
use Google\Cloud\Dlp\V2\FieldId;
use Google\Cloud\Dlp\V2\GetDlpJobRequest;
use Google\Cloud\Dlp\V2\PrivacyMetric;
use Google\Cloud\Dlp\V2\PrivacyMetric\LDiversityConfig;
use Google\Cloud\Dlp\V2\RiskAnalysisJobConfig;
use Google\Cloud\PubSub\PubSubClient;
/**
 * Computes the l-diversity of a column set in a Google BigQuery table.
 *
 * Submits a risk analysis job, polls the Pub/Sub subscription (for up to 10
 * minutes) for the job's notification, then prints the job state and, when
 * the job is done, its l-diversity histogram.
 *
 * @param string    $callingProjectId    The project ID to run the API call under
 * @param string    $dataProjectId       The project ID containing the target BigQuery table
 * @param string    $topicId             The name of the Pub/Sub topic to notify once the job completes
 * @param string    $subscriptionId      The name of the Pub/Sub subscription to use when listening for job
 *                                       completion notifications
 * @param string    $datasetId           The ID of the dataset to inspect
 * @param string    $tableId             The ID of the table to inspect
 * @param string    $sensitiveAttribute  The column to measure l-diversity relative to, e.g. "firstName"
 * @param string[]  $quasiIdNames        Array columns that form a composite key (quasi-identifiers)
 */
function l_diversity(
    string $callingProjectId,
    string $dataProjectId,
    string $topicId,
    string $subscriptionId,
    string $datasetId,
    string $tableId,
    string $sensitiveAttribute,
    array $quasiIdNames
): void {
    // Instantiate a client.
    $dlp = new DlpServiceClient();
    $pubsub = new PubSubClient();
    $topic = $pubsub->topic($topicId);

    // Construct risk analysis config
    $quasiIds = array_map(
        function ($id) {
            return (new FieldId())->setName($id);
        },
        $quasiIdNames
    );

    $sensitiveField = (new FieldId())
        ->setName($sensitiveAttribute);

    $statsConfig = (new LDiversityConfig())
        ->setQuasiIds($quasiIds)
        ->setSensitiveAttribute($sensitiveField);

    $privacyMetric = (new PrivacyMetric())
        ->setLDiversityConfig($statsConfig);

    // Construct items to be analyzed
    $bigqueryTable = (new BigQueryTable())
        ->setProjectId($dataProjectId)
        ->setDatasetId($datasetId)
        ->setTableId($tableId);

    // Construct the action to run when job completes
    $pubSubAction = (new PublishToPubSub())
        ->setTopic($topic->name());

    $action = (new Action())
        ->setPubSub($pubSubAction);

    // Construct risk analysis job config to run
    $riskJob = (new RiskAnalysisJobConfig())
        ->setPrivacyMetric($privacyMetric)
        ->setSourceTable($bigqueryTable)
        ->setActions([$action]);

    // Listen for job notifications via an existing topic/subscription.
    $subscription = $topic->subscription($subscriptionId);

    // Submit request
    $parent = "projects/$callingProjectId/locations/global";
    $createDlpJobRequest = (new CreateDlpJobRequest())
        ->setParent($parent)
        ->setRiskJob($riskJob);
    $job = $dlp->createDlpJob($createDlpJobRequest);

    // Poll Pub/Sub using exponential backoff until job finishes
    // Consider using an asynchronous execution model such as Cloud Functions
    $attempt = 1;
    $startTime = time();
    do {
        foreach ($subscription->pull() as $message) {
            if (
                isset($message->attributes()['DlpJobName'])
                && $message->attributes()['DlpJobName'] === $job->getName()
            ) {
                $subscription->acknowledge($message);
                // Get the updated job. Loop to avoid race condition with DLP API.
                do {
                    $getDlpJobRequest = (new GetDlpJobRequest())
                        ->setName($job->getName());
                    $job = $dlp->getDlpJob($getDlpJobRequest);
                    $stillRunning = $job->getState() == JobState::RUNNING;
                    if ($stillRunning) {
                        // Pause between polls so the API is not called in a tight loop.
                        sleep(1);
                    }
                } while ($stillRunning);
                break 2; // break from parent do while
            }
        }
        print('Waiting for job to complete' . PHP_EOL);
        // Exponential backoff with max delay of 60 seconds
        sleep(min(60, pow(2, ++$attempt)));
    } while (time() - $startTime < 600); // 10 minute timeout

    // Print finding counts
    printf('Job %s status: %s' . PHP_EOL, $job->getName(), JobState::name($job->getState()));
    switch ($job->getState()) {
        case JobState::DONE:
            $histBuckets = $job->getRiskDetails()->getLDiversityResult()->getSensitiveValueFrequencyHistogramBuckets();

            foreach ($histBuckets as $bucketIndex => $histBucket) {
                // Print bucket stats
                printf('Bucket %s:' . PHP_EOL, $bucketIndex);
                printf(
                    ' Bucket size range: [%s, %s]' . PHP_EOL,
                    $histBucket->getSensitiveValueFrequencyLowerBound(),
                    $histBucket->getSensitiveValueFrequencyUpperBound()
                );

                // Print bucket values
                foreach ($histBucket->getBucketValues() as $valueBucket) {
                    printf(
                        ' Class size: %s' . PHP_EOL,
                        $valueBucket->getEquivalenceClassSize()
                    );

                    // Pretty-print quasi-ID values
                    print(' Quasi-ID values:' . PHP_EOL);
                    foreach ($valueBucket->getQuasiIdsValues() as $value) {
                        print(' ' . $value->serializeToJsonString() . PHP_EOL);
                    }

                    // Pretty-print sensitive values
                    $topValues = $valueBucket->getTopSensitiveValues();
                    foreach ($topValues as $topValue) {
                        printf(
                            ' Sensitive value %s occurs %s time(s).' . PHP_EOL,
                            $topValue->getValue()->serializeToJsonString(),
                            $topValue->getCount()
                        );
                    }
                }
            }
            break;
        case JobState::FAILED:
            printf('Job %s had errors:' . PHP_EOL, $job->getName());
            $errors = $job->getErrors();
            foreach ($errors as $error) {
                var_dump($error->getDetails());
            }
            break;
        case JobState::PENDING:
            print('Job has not completed. Consider a longer timeout or an asynchronous execution model' . PHP_EOL);
            break;
        default:
            print('Unexpected job state. Most likely, the job is either running or has not yet started.');
    }
}
Python
To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.
To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
import
concurrent.futures
from
typing
import
List
import
google.cloud.dlp
from
google.cloud.dlp_v2
import
types
import
google.cloud.pubsub
def l_diversity_analysis(
    project: str,
    table_project_id: str,
    dataset_id: str,
    table_id: str,
    topic_id: str,
    subscription_id: str,
    sensitive_attribute: str,
    quasi_ids: List[str],
    timeout: int = 300,
) -> None:
    """Uses the Data Loss Prevention API to compute the l-diversity of a
    column set in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        sensitive_attribute: The column to measure l-diversity relative to.
        quasi_ids: A set of columns that form a composite key.
        timeout: The number of seconds to wait for the job's completion
            notification on the subscription.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Create helper function for unpacking values.
    # Only the integer_value field of the Value is read.
    def get_values(obj: types.Value) -> int:
        return int(obj.integer_value)

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
    parent = f"projects/{project}/locations/global"

    # Location info of the BigQuery table.
    source_table = {
        "project_id": table_project_id,
        "dataset_id": dataset_id,
        "table_id": table_id,
    }

    # Convert the quasi id names to field dicts. A concrete list is built
    # (rather than a lazy map object) so the request holds real values.
    quasi_id_fields = [{"name": field} for field in quasi_ids]

    # Tell the API where to send a notification when the job is complete.
    actions = [{"pub_sub": {"topic": topic}}]

    # Configure risk analysis job
    # Give the quasi-identifier columns and the sensitive column to measure
    risk_job = {
        "privacy_metric": {
            "l_diversity_config": {
                "quasi_ids": quasi_id_fields,
                "sensitive_attribute": {"name": sensitive_attribute},
            }
        },
        "source_table": source_table,
        "actions": actions,
    }

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(request={"parent": parent, "risk_job": risk_job})

    def callback(message: google.cloud.pubsub_v1.subscriber.message.Message) -> None:
        # .get() rather than indexing: a message without a DlpJobName
        # attribute must be dropped, not raise KeyError.
        if message.attributes.get("DlpJobName") == operation.name:
            # This is the message we're looking for, so acknowledge it.
            message.ack()

            # Now that the job is done, fetch the results and print them.
            job = dlp.get_dlp_job(request={"name": operation.name})
            print(f"Job name: {job.name}")
            histogram_buckets = (
                job.risk_details.l_diversity_result.sensitive_value_frequency_histogram_buckets  # noqa: E501
            )
            # Print bucket stats
            for i, bucket in enumerate(histogram_buckets):
                print(f"Bucket {i}:")
                print(
                    " Bucket size range: [{}, {}]".format(
                        bucket.sensitive_value_frequency_lower_bound,
                        bucket.sensitive_value_frequency_upper_bound,
                    )
                )
                for value_bucket in bucket.bucket_values:
                    # Materialize the values; formatting a bare map object
                    # would print its repr instead of the values.
                    print(
                        " Quasi-ID values: {}".format(
                            list(map(get_values, value_bucket.quasi_ids_values))
                        )
                    )
                    print(f" Class size: {value_bucket.equivalence_class_size}")
                    for value in value_bucket.top_sensitive_values:
                        print(
                            " Sensitive value {} occurs {} time(s)".format(
                                value.value, value.count
                            )
                        )
            # Unblock the subscription.result() call at the end of the function.
            subscription.set_result(None)
        else:
            # This is not the message we're looking for.
            message.drop()

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(project, subscription_id)
    subscription = subscriber.subscribe(subscription_path, callback)

    try:
        subscription.result(timeout=timeout)
    except concurrent.futures.TimeoutError:
        print(
            "No event received before the timeout. Please verify that the "
            "subscription provided is subscribed to the topic provided."
        )
        # Stop the background streaming pull.
        subscription.cancel()
What's next
To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.

