This is my first datframe:
s_cmdb_id = c("BEAT0001", "BEAT0002", "BEAT0003","BEAT0004", "BEAT0005", "BEAT0006", "ROLL0001", "ROLL0002", "ROLL0003","ROLL0004", "ROLL0005", "ROLL0006", "ROLL0007", "EAGL0001", "EAGL0002","EAGL0003", "EAGL0004", "EAGL0005")
s_name = c("JOHNL", "PAULM","GEORGEH", "RINGOS", "BRIANE", "GEORGEM", "MICKG", "KEITHR","CHARLIEW", "RONW", "BILLW", "BRIANJ", "MICKT", "GLENNF", "DONH", "TIMB", "JOEW", "DONF")
s_dns = c("johnl.winston.lennon", "paulm.sir.james.paul.mccartney","georgeh.george.harrison", "ringos.sir.richard.starkey", "briane.brian.samuel.epstein","georgem.sir.george.henry.martin", "mickj.sir.michael.philip.jagger","keithr.keith.richards", "charliew.charles.robert.watts", "ronw.ronald.david.wood","billw.william.george.wyman", "brianj.lewis.brian.hokin.jones","mickt.michael.kevin.taylor", "glennf.glenn.lewis.frey", "donh.donald.hugh.henley","timb.timothy.bruce.schmit", "joew.joseph.fidler.walsh", "donf.donald.william.felder")
s_company = c("Beatles Company", "Beatles Company", "Beatles Company","Beatles Company", "Beatles Company", "Beatles Company", "Rolling Company","Rolling Company", "Rolling Company", "Rolling Company", "Rolling Company","Rolling Company", "Rolling Company", "Eagles Company", "Eagles Company","Eagles Company", "Eagles Company", "Eagles Company")
s_environ = c("Production","Production", "Production", "Test", "Contingency", "Development","Production", "Certification", "Pre-Production", "Development","Test", "Production", "Test", "Production", "Production", "Development","Development", "Development")
s_ip_address = c("160.61.22.110","160.61.22.111", "160.61.22.112", "160.61.22.1", "160.61.20.13","150.23.33.130", "150.23.33.131", "150.23.33.132", NA, "150.23.33.134","150.23.33.135", NA, NA, "120.220.210.222", "120.220.210.223","120.220.210.224", "120.220.210.225", "120.220.210.226")
s_ip_other = c("16.30.12.110 14.22.33.1","16.30.12.111", "16.30.12.112", "16.30.12.1", "16.30.12.13",NA, "14.22.33.131", "14.22.33.132", "14.22.33.133 13.21.32.1 13.21.32.2","14.22.33.134", "14.22.33.135", "14.22.33.136 13.20.30.1 13.20.30.2","14.22.33.137 13.19.29.4 13.19.29.44", NA, NA, "110.11.2.3","110.11.2.4", "110.11.2.5")
s_status = c("Lawsuit", "Active","Available", "Active", "Damaged", "Active", "Active", "Active","End of Life", "Lawsuit", "Lost", "Scrapped", "Pending Disposal","Active", "Active", "Pending Wipe", "Wiped", "Wiped")
df_cmdb <- data.frame(s_cmdb_id,s_name,s_dns,s_company,s_environ,s_ip_address,s_ip_other,s_status)
> df_cmdb
s_cmdb_id s_name s_dns s_company s_environ s_ip_address s_ip_other s_status
1 BEAT0001 JOHNL johnl.winston.lennon Beatles Company Production 160.61.22.110 16.30.12.110 14.22.33.1 Lawsuit
2 BEAT0002 PAULM paulm.sir.james.paul.mccartney Beatles Company Production 160.61.22.111 16.30.12.111 Active
3 BEAT0003 GEORGEH georgeh.george.harrison Beatles Company Production 160.61.22.112 16.30.12.112 Available
4 BEAT0004 RINGOS ringos.sir.richard.starkey Beatles Company Test 160.61.22.1 16.30.12.1 Active
5 BEAT0005 BRIANE briane.brian.samuel.epstein Beatles Company Contingency 160.61.20.13 16.30.12.13 Damaged
6 BEAT0006 GEORGEM georgem.sir.george.henry.martin Beatles Company Development 150.23.33.130 <NA> Active
7 ROLL0001 MICKG mickj.sir.michael.philip.jagger Rolling Company Production 150.23.33.131 14.22.33.131 Active
8 ROLL0002 KEITHR keithr.keith.richards Rolling Company Certification 150.23.33.132 14.22.33.132 Active
9 ROLL0003 CHARLIEW charliew.charles.robert.watts Rolling Company Pre-Production <NA> 14.22.33.133 13.21.32.1 13.21.32.2 End of Life
10 ROLL0004 RONW ronw.ronald.david.wood Rolling Company Development 150.23.33.134 14.22.33.134 Lawsuit
11 ROLL0005 BILLW billw.william.george.wyman Rolling Company Test 150.23.33.135 14.22.33.135 Lost
12 ROLL0006 BRIANJ brianj.lewis.brian.hokin.jones Rolling Company Production <NA> 14.22.33.136 13.20.30.1 13.20.30.2 Scrapped
13 ROLL0007 MICKT mickt.michael.kevin.taylor Rolling Company Test <NA> 14.22.33.137 13.19.29.4 13.19.29.44 Pending Disposal
14 EAGL0001 GLENNF glennf.glenn.lewis.frey Eagles Company Production 120.220.210.222 <NA> Active
15 EAGL0002 DONH donh.donald.hugh.henley Eagles Company Production 120.220.210.223 <NA> Active
16 EAGL0003 TIMB timb.timothy.bruce.schmit Eagles Company Development 120.220.210.224 110.11.2.3 Pending Wipe
17 EAGL0004 JOEW joew.joseph.fidler.walsh Eagles Company Development 120.220.210.225 110.11.2.4 Wiped
18 EAGL0005 DONF donf.donald.william.felder Eagles Company Development 120.220.210.226
110.11.2.5 Wiped
This is my second dataframe:
s_ip = c("14.22.33.1", "14.22.33.1", "16.30.12.111","16.30.12.111", "150.23.33.132", "13.21.32.2", "13.19.29.4")
s_dns = c("johnl.winston.lennon.dev", "johnl.winston.lennon.dev","paulm.sir.james.paul.mccartney", NA, "keithr.keith.richards","charliew.charles.robert.watts.cert", NA)
s_qid = c("38169","38170", "38601", "38603", "38909", "38655", "42366")
n_port = c(56134L,56134L, 56134L, 221L, 22L, 5634L, 9090L)
s_protocol = c("tcp","tcp", "tcp", "udp", "tcp", "tcp", "tcp")
df_scan <- data.frame(s_ip,s_dns,s_qid,n_port,s_protocol)
> df_scan
s_ip s_dns s_qid n_port s_protocol
1 14.22.33.1 johnl.winston.lennon.dev 38169 56134 tcp
2 14.22.33.1 johnl.winston.lennon.dev 38170 56134 tcp
3 16.30.12.111 paulm.sir.james.paul.mccartney 38601 56134 tcp
4 16.30.12.111 <NA> 38603 221 udp
5 150.23.33.132 keithr.keith.richards 38909 22 tcp
6 13.21.32.2 charliew.charles.robert.watts.cert 38655 5634 tcp
7 13.19.29.4 <NA> 42366 9090 tcp
And this is my final dataframe:
s_ip = c("14.22.33.1", "14.22.33.1", "16.30.12.111","16.30.12.111", "150.23.33.132", "13.21.32.2", "13.19.29.4")
s_dns = c("johnl.winston.lennon.dev", "johnl.winston.lennon.dev","paulm.sir.james.paul.mccartney", NA, "keithr.keith.richards","charliew.charles.robert.watts.cert", NA)
s_qid = c("38169","38170", "38601", "38603", "38909", "38655", "42366")
n_port = c(56134L,56134L, 56134L, 221L, 22L, 5634L, 9090L)
s_protocol = c("tcp","tcp", "tcp", "udp", "tcp", "tcp", "tcp")
s_cmdb_id = c("BEAT0001","BEAT0001", "BEAT0002", "BEAT0002", "ROLL0002", "ROLL0003","ROLL0007")
s_name = c("JOHNL", "JOHNL", "PAULM", "PAULM","KEITHR", "CHARLIEW", "MICKT")
s_company = c("Beatles Company","Beatles Company", "Beatles Company", "Beatles Company","Rolling Company", "Rolling Company", "Rolling Company")
s_status = c("Lawsuit", "Lawsuit", "Active", "Active", "Active","End of Life", "Pending Disposal")
df_final <- data.frame(s_ip,s_dns,s_qid,n_port,s_protocol,s_cmdb_id,s_name,s_company,s_status)
> df_final
s_ip s_dns s_qid n_port s_protocol s_cmdb_id s_name s_company s_status
1 14.22.33.1 johnl.winston.lennon.dev 38169 56134 tcp BEAT0001 JOHNL Beatles Company Lawsuit
2 14.22.33.1 johnl.winston.lennon.dev 38170 56134 tcp BEAT0001 JOHNL Beatles Company Lawsuit
3 16.30.12.111 paulm.sir.james.paul.mccartney 38601 56134 tcp BEAT0002 PAULM Beatles Company Active
4 16.30.12.111 <NA> 38603 221 udp BEAT0002 PAULM Beatles Company Active
5 150.23.33.132 keithr.keith.richards 38909 22 tcp ROLL0002 KEITHR Rolling Company Active
6 13.21.32.2 charliew.charles.robert.watts.cert 38655 5634 tcp ROLL0003 CHARLIEW Rolling Company End of Life
7 13.19.29.4 <NA> 42366 9090 tcp ROLL0007 MICKT Rolling Company Pending Disposal
- As you can see, df_final is a left_join of df_scan and df_cmdb.
- There are 4 new fields added, these are related by the s_ip field (df_scan) <—> s_ip_address+s_ip_other fields (df_cmdb).
- The ip addresses in s_ip_address and s_ip_other fields can be separated by one or more spaces.
- Consider that the ip addresses could be similiar (but no equal) ex. ips: 14.22.33.1 is similar to 14.22.33.131
- Columns s_ip_address and s_ip_other could have NA values.
- What is the best way to build this relation as in df_final dataframe?
- Please consider that df_scan contains one million records and df_cmdb contains 100k records.
- Thanks in advance.
- Im a beginner in R language.