Commit 7b217d1b authored by Szabolcs Gyurko's avatar Szabolcs Gyurko
Browse files

Added customisable random functions, refactored the pseudoRandomiser class for...

Added customisable random functions, refactored the pseudoRandomiser class for this, few optimisations added
parent 81887534
......@@ -35,34 +35,37 @@ type AnonymiserDatabases struct {
Drop *bool `yaml:"drop"`
}
// RandomiserRule describes one custom random function declared under
// "randomSetup" in rules.yaml.
type RandomiserRule struct {
	// Function is the macro name; it is matched in replacement strings as
	// Function followed by "()".
	Function string `yaml:"function"`
	// File optionally names a JSON file, resolved against the randomiser's
	// data directory, that holds the candidate values.
	File *string `yaml:"file"`
	// Expression optionally holds text that is substituted for the macro
	// before any other macro is processed.
	Expression *string `yaml:"expression"`
	// RowLevel, when set and true, makes the value be generated once per
	// row and reused for every occurrence within that row.
	RowLevel *bool `yaml:"rowLevel"`
	// Values optionally supplies the candidate values inline.
	Values *[]string `yaml:"values"`
}
// AnonymiserRules is the top-level shape of rules.yaml: the custom random
// function declarations plus the per-database rules.
type AnonymiserRules struct {
	// RandomSetup lists the custom random functions, in file order.
	RandomSetup []RandomiserRule `yaml:"randomSetup"`
	// Databases maps a database name to its configuration.
	Databases map[string]AnonymiserDatabases `yaml:"databases"`
}
// Regex objects for pre-compilation
type RegexpCompile struct {
old *regexp.Regexp
new *regexp.Regexp
randomint *regexp.Regexp
first *regexp.Regexp
middle *regexp.Regexp
last *regexp.Regexp
postcode *regexp.Regexp
email *regexp.Regexp
uuid *regexp.Regexp
field *regexp.Regexp
old *regexp.Regexp
new *regexp.Regexp
randomint *regexp.Regexp
uuid *regexp.Regexp
field *regexp.Regexp
randomiserSetup []*regexp.Regexp
}
// GenericAnonymiser applies the rules loaded from rules.yaml, expanding the
// random macros with values obtained from a Randomiser.
//
// Every field is declared exactly once; declaring the same field name twice
// in a struct is a compile error.
type GenericAnonymiser struct {
	// Per-row names, regenerated whenever a new row UUID is seen.
	// NOTE(review): these look superseded by rowLevelRandoms — confirm that
	// nothing still reads them before removing.
	first  string
	middle string
	last   string

	// uuid is the identifier of the row the per-row values were generated for.
	uuid         string
	deferred     []string
	regexp       RegexpCompile
	rules        AnonymiserRules
	randomiser   Randomiser
	uniqueValues map[string]bool

	// rowLevelRandoms maps a function name to the value generated for the
	// current row, for rules whose RowLevel flag is true.
	rowLevelRandoms map[string]string
}
// Simple function to read and parse a YAML file
......@@ -90,13 +93,16 @@ func NewGenericAnonymiser(dir string, randomiser Randomiser) (*GenericAnonymiser
if err := r.readYaml(dir+string(os.PathSeparator)+"rules.yaml", &r.rules); err != nil {
return nil, err
}
if err := r.randomiser.Init(r.rules.RandomSetup); err != nil {
return nil, err
}
r.regexp.old = regexp.MustCompile(":OLD")
r.regexp.new = regexp.MustCompile(":NEW")
r.regexp.first = regexp.MustCompile("randomFirst\\(\\)")
r.regexp.middle = regexp.MustCompile("randomMiddle\\(\\)")
r.regexp.last = regexp.MustCompile("randomLast\\(\\)")
r.regexp.email = regexp.MustCompile("randomEmail\\(\\)")
r.regexp.postcode = regexp.MustCompile("randomPostcode\\(\\)")
// Pre-compile function name regexes
for _, v := range r.rules.RandomSetup {
r.regexp.randomiserSetup = append(r.regexp.randomiserSetup, regexp.MustCompile(v.Function+"\\(\\)"))
}
r.regexp.randomint = regexp.MustCompile("randomInt\\(\\)")
r.regexp.uuid = regexp.MustCompile("randomUUID\\(\\)")
r.regexp.field = regexp.MustCompile(":[a-zA-Z0-9_]+")
......@@ -122,22 +128,41 @@ func (r *GenericAnonymiser) generateRandoms(uuid string) {
if uuid != r.uuid {
r.uuid = uuid
r.first = r.randomiser.First()
r.middle = r.randomiser.Middle()
r.last = r.randomiser.Last()
// Generate row level randoms
r.rowLevelRandoms = make(map[string]string)
for _, rule := range(r.rules.RandomSetup) {
if rule.RowLevel != nil && *rule.RowLevel {
r.rowLevelRandoms[rule.Function] = r.randomiser.Randomise(rule.Function)
}
}
}
}
// Parses known macros and performs the substitution
func (r *GenericAnonymiser) parseRandoms(data string, row AnonymiserRow) string {
result := r.regexp.first.ReplaceAllString(data, r.first)
result = r.regexp.middle.ReplaceAllString(result, r.middle)
result = r.regexp.last.ReplaceAllString(result, r.last)
result = r.regexp.email.ReplaceAllString(result, strings.ToLower(r.first)+"."+strings.ToLower(r.middle)+"@"+strings.ToLower(r.last)+".co.uk")
result = r.regexp.postcode.ReplaceAllString(result, r.randomiser.Postcode())
result := data
// Expand expressions first (if any)
for _, rule := range(r.rules.RandomSetup) {
if rule.Expression != nil {
result = strings.ReplaceAll(result, rule.Function + "()", *rule.Expression)
}
}
// Process row level randoms (these are pre-generated for every row and we don't want them to change within a row)
for k, v := range(r.rowLevelRandoms) {
result = strings.ReplaceAll(result, k + "()", v)
}
// Built-in functions
result = r.regexp.randomint.ReplaceAllString(result, fmt.Sprintf("%v", rand.Int()))
result = r.regexp.uuid.ReplaceAllString(result, uuid.New().String())
// Now process all the custom functions
for _, regex := range(r.regexp.randomiserSetup) {
result = regex.ReplaceAllString(result, r.randomiser.Randomise(strings.TrimSuffix(regex.String(), "\\(\\)")))
}
// Look for field back references
if matches := r.regexp.field.FindAllString(result, -1); matches != nil {
// Process them one by one
......
......@@ -13,11 +13,8 @@ type Anonymiser interface {
// Randomiser is the source of random values used by the anonymiser.
type Randomiser interface {
	// First returns a first name.
	First() string
	// Last returns a last name.
	Last() string
	// Middle returns a middle name.
	Middle() string
	// Tld returns a top-level domain.
	Tld() string
	// Postcode returns a postcode.
	Postcode() string
	// Init prepares the randomiser from the configured rules and reports
	// any failure to do so.
	Init([]RandomiserRule) error
	// Randomise returns a value for the named custom function.
	Randomise(string) string
}
type AnonymiserRow struct {
......
......@@ -11,6 +11,8 @@ import (
// Internal structures
type PseudoRandomiser struct {
directory string
rules map[string][]string
firstNames []string
middleNames []string
lastNames []string
......@@ -34,41 +36,33 @@ func (*PseudoRandomiser) readJson(file string, output interface{}) error {
// NewPseudoRandomiser builds a PseudoRandomiser rooted at dir.
//
// It loads the five built-in JSON lists from dir, in a fixed order, and stops
// at the first one that fails, returning that error with a nil randomiser.
// On success the rule table is empty and dir is remembered for later use.
func NewPseudoRandomiser(dir string) (*PseudoRandomiser, error) {
	p := &PseudoRandomiser{}
	prefix := dir + string(os.PathSeparator)

	// Each entry pairs a file name with the field that receives its content.
	sources := []struct {
		file string
		dest interface{}
	}{
		{"first-names.json", &p.firstNames},
		{"middle-names.json", &p.middleNames},
		{"last-names.json", &p.lastNames},
		{"tld.json", &p.tlds},
		{"postcodes.json", &p.postCodes},
	}
	for _, src := range sources {
		if err := p.readJson(prefix+src.file, src.dest); err != nil {
			return nil, err
		}
	}

	p.rules = make(map[string][]string)
	p.directory = dir
	return p, nil
}
// First returns a randomly chosen entry of the first-name list, or "" when
// the list is empty (rand.Intn panics for a non-positive argument).
func (r *PseudoRandomiser) First() string {
	if len(r.firstNames) == 0 {
		return ""
	}
	return r.firstNames[rand.Intn(len(r.firstNames))]
}
func (r *PseudoRandomiser) Init(rules []RandomiserRule) error {
for _, rule := range rules {
if rule.File != nil {
var items []string
if err := r.readJson(r.directory + string(os.PathSeparator) + *rule.File, &items); err != nil {
return err
}
r.rules[rule.Function] = items
} else if rule.Values != nil {
r.rules[rule.Function] = *rule.Values
}
}
func (r *PseudoRandomiser) Middle() string {
return r.middleNames[rand.Intn(len(r.middleNames))]
return nil
}
// Last returns a randomly chosen entry of the last-name list, or "" when the
// list is empty (rand.Intn panics for a non-positive argument).
func (r *PseudoRandomiser) Last() string {
	if len(r.lastNames) == 0 {
		return ""
	}
	return r.lastNames[rand.Intn(len(r.lastNames))]
}
func (r *PseudoRandomiser) Randomise(function string) string {
if items, ok := r.rules[function]; ok {
return items[rand.Intn(len(items))]
}
func (r *PseudoRandomiser) Tld() string {
return r.tlds[rand.Intn(len(r.tlds))]
return ""
}
// Postcode returns a randomly chosen entry of the postcode list, or "" when
// the list is empty (rand.Intn panics for a non-positive argument).
func (r *PseudoRandomiser) Postcode() string {
	if len(r.postCodes) == 0 {
		return ""
	}
	return r.postCodes[rand.Intn(len(r.postCodes))]
}
---
# Setup for random functions
#
# In addition to the functions declared here, randomInt() and randomUUID() are always available as built-in functions.
#
randomSetup:
# If "rowLevel" is present and set to "true", the function is evaluated only once per row and every occurrence
# of it within that row yields the same value. Useful for keeping names consistent across, say, the firstName and email columns.
- function: randomFirst
file: first-names.json
rowLevel: true
- function: randomMiddle
file: middle-names.json
rowLevel: true
- function: randomLast
file: last-names.json
rowLevel: true
- function: randomTld
# file: tld.json
# An example of supplying a fixed list within the config.
values: ["co.uk"]
- function: randomPostcode
file: postcodes.json
- function: randomEmail
expression: "randomFirst().randomMiddle()@randomLast().randomTld()"
databases:
vw_auth:
tables:
......@@ -267,8 +293,8 @@ databases:
tables:
short_url:
drop: true
# tax_p11d:
# rules:
# - field: tax_year
# replace: "randomInt() - :id - :percent"
# defer: UPDATE tax_p11d SET percent=:id where id=:id;
tax_p11d:
rules:
- field: tax_year
replace: "randomInt() - :id - :percent - randomFirst() - randomEmail() - randomPostcode()"
defer: UPDATE tax_p11d SET percent=:id where id=:id;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment