Previously, the analyzer assumed that two consecutive flows would be a pair. This proved unreliable, therefore a new comparison algorithm is used. It utilizes the IP addresses and the 'first_switched' parameter to identify two flows of the same connection. More improvements can be done, especially in filtering and in the identification of the initiating peer. Tests still fail; they have to be adapted to the new dicts and gzip.
352 lines
12 KiB
Python
352 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
|
|
"""
|
|
Netflow V9 collector and parser implementation in Python 3.
|
|
Created for learning purposes and unsatisfying alternatives.
|
|
|
|
Reference: https://www.cisco.com/en/US/technologies/tk648/tk362/technologies_white_paper09186a00800a3db9.html
|
|
|
|
This script is specifically implemented in combination with softflowd.
|
|
See https://github.com/djmdjm/softflowd
|
|
|
|
Copyright 2017, 2018 Dominik Pataky <dev@bitkeks.eu>
|
|
Licensed under MIT License. See LICENSE.
|
|
"""
|
|
|
|
import ipaddress
|
|
import struct
|
|
|
|
|
|
# Mapping of NetFlow v9 field type IDs to their symbolic names. Used both to
# validate template fields and as the keys of DataRecord.data.
FIELD_TYPES = {
    0: 'UNKNOWN_FIELD_TYPE',  # fallback for unknown field types

    # Cisco specs for NetFlow v9
    # https://tools.ietf.org/html/rfc3954
    # https://www.cisco.com/en/US/technologies/tk648/tk362/technologies_white_paper09186a00800a3db9.html
    1: 'IN_BYTES',
    2: 'IN_PKTS',
    3: 'FLOWS',
    4: 'PROTOCOL',
    5: 'SRC_TOS',
    6: 'TCP_FLAGS',
    7: 'L4_SRC_PORT',
    8: 'IPV4_SRC_ADDR',
    9: 'SRC_MASK',
    10: 'INPUT_SNMP',
    11: 'L4_DST_PORT',
    12: 'IPV4_DST_ADDR',
    13: 'DST_MASK',
    14: 'OUTPUT_SNMP',
    15: 'IPV4_NEXT_HOP',
    16: 'SRC_AS',
    17: 'DST_AS',
    18: 'BGP_IPV4_NEXT_HOP',
    19: 'MUL_DST_PKTS',
    20: 'MUL_DST_BYTES',
    21: 'LAST_SWITCHED',
    22: 'FIRST_SWITCHED',
    23: 'OUT_BYTES',
    24: 'OUT_PKTS',
    25: 'MIN_PKT_LNGTH',
    26: 'MAX_PKT_LNGTH',
    27: 'IPV6_SRC_ADDR',
    28: 'IPV6_DST_ADDR',
    29: 'IPV6_SRC_MASK',
    30: 'IPV6_DST_MASK',
    31: 'IPV6_FLOW_LABEL',
    32: 'ICMP_TYPE',
    33: 'MUL_IGMP_TYPE',
    34: 'SAMPLING_INTERVAL',
    35: 'SAMPLING_ALGORITHM',
    36: 'FLOW_ACTIVE_TIMEOUT',
    37: 'FLOW_INACTIVE_TIMEOUT',
    38: 'ENGINE_TYPE',
    39: 'ENGINE_ID',
    40: 'TOTAL_BYTES_EXP',
    41: 'TOTAL_PKTS_EXP',
    42: 'TOTAL_FLOWS_EXP',
    # 43 vendor proprietary
    44: 'IPV4_SRC_PREFIX',
    45: 'IPV4_DST_PREFIX',
    46: 'MPLS_TOP_LABEL_TYPE',
    47: 'MPLS_TOP_LABEL_IP_ADDR',
    48: 'FLOW_SAMPLER_ID',
    49: 'FLOW_SAMPLER_MODE',
    50: 'FLOW_SAMPLER_RANDOM_INTERVAL',  # was truncated to 'NTERVAL'; per Cisco spec
    # 51 vendor proprietary
    52: 'MIN_TTL',
    53: 'MAX_TTL',
    54: 'IPV4_IDENT',
    55: 'DST_TOS',
    56: 'IN_SRC_MAC',
    57: 'OUT_DST_MAC',
    58: 'SRC_VLAN',
    59: 'DST_VLAN',
    60: 'IP_PROTOCOL_VERSION',
    61: 'DIRECTION',
    62: 'IPV6_NEXT_HOP',
    63: 'BGP_IPV6_NEXT_HOP',  # was misspelled 'BPG_...'; matches 18 'BGP_IPV4_NEXT_HOP'
    64: 'IPV6_OPTION_HEADERS',
    # 65-69 vendor proprietary
    70: 'MPLS_LABEL_1',
    71: 'MPLS_LABEL_2',
    72: 'MPLS_LABEL_3',
    73: 'MPLS_LABEL_4',
    74: 'MPLS_LABEL_5',
    75: 'MPLS_LABEL_6',
    76: 'MPLS_LABEL_7',
    77: 'MPLS_LABEL_8',
    78: 'MPLS_LABEL_9',
    79: 'MPLS_LABEL_10',
    80: 'IN_DST_MAC',
    81: 'OUT_SRC_MAC',
    82: 'IF_NAME',
    83: 'IF_DESC',
    84: 'SAMPLER_NAME',
    85: 'IN_PERMANENT_BYTES',
    86: 'IN_PERMANENT_PKTS',
    # 87 vendor proprietary
    88: 'FRAGMENT_OFFSET',
    89: 'FORWARDING_STATUS',
    90: 'MPLS_PAL_RD',
    91: 'MPLS_PREFIX_LEN',  # Number of consecutive bits in the MPLS prefix length.
    92: 'SRC_TRAFFIC_INDEX',  # BGP Policy Accounting Source Traffic Index
    93: 'DST_TRAFFIC_INDEX',  # BGP Policy Accounting Destination Traffic Index
    94: 'APPLICATION_DESCRIPTION',  # Application description
    95: 'APPLICATION_TAG',  # 8 bits of engine ID, followed by n bits of classification
    96: 'APPLICATION_NAME',  # Name associated with a classification
    98: 'postipDiffServCodePoint',  # The value of a Differentiated Services Code Point (DSCP) encoded in the Differentiated Services Field, after modification
    99: 'replication_factor',  # Multicast replication factor
    100: 'DEPRECATED',  # DEPRECATED
    102: 'layer2packetSectionOffset',  # Layer 2 packet section offset. Potentially a generic offset
    103: 'layer2packetSectionSize',  # Layer 2 packet section size. Potentially a generic size
    104: 'layer2packetSectionData',  # Layer 2 packet section data
    # 105-127 reserved for future use by Cisco

    # ASA extensions
    # https://www.cisco.com/c/en/us/td/docs/security/asa/special/netflow/guide/asa_netflow.html
    148: 'NF_F_CONN_ID',  # An identifier of a unique flow for the device
    176: 'NF_F_ICMP_TYPE',  # ICMP type value
    177: 'NF_F_ICMP_CODE',  # ICMP code value
    178: 'NF_F_ICMP_TYPE_IPV6',  # ICMP IPv6 type value
    179: 'NF_F_ICMP_CODE_IPV6',  # ICMP IPv6 code value
    225: 'NF_F_XLATE_SRC_ADDR_IPV4',  # Post NAT Source IPv4 Address
    226: 'NF_F_XLATE_DST_ADDR_IPV4',  # Post NAT Destination IPv4 Address
    227: 'NF_F_XLATE_SRC_PORT',  # Post NAT Source Transport Port
    228: 'NF_F_XLATE_DST_PORT',  # Post NAT Destination Transport Port
    281: 'NF_F_XLATE_SRC_ADDR_IPV6',  # Post NAT Source IPv6 Address
    282: 'NF_F_XLATE_DST_ADDR_IPV6',  # Post NAT Destination IPv6 Address
    233: 'NF_F_FW_EVENT',  # High-level event code
    33002: 'NF_F_FW_EXT_EVENT',  # Extended event code
    323: 'NF_F_EVENT_TIME_MSEC',  # The time that the event occurred, which comes from IPFIX
    152: 'NF_F_FLOW_CREATE_TIME_MSEC',
    231: 'NF_F_FWD_FLOW_DELTA_BYTES',  # The delta number of bytes from source to destination
    232: 'NF_F_REV_FLOW_DELTA_BYTES',  # The delta number of bytes from destination to source
    33000: 'NF_F_INGRESS_ACL_ID',  # The input ACL that permitted or denied the flow
    33001: 'NF_F_EGRESS_ACL_ID',  # The output ACL that permitted or denied a flow
    40000: 'NF_F_USERNAME',  # AAA username

    # PaloAlto PAN-OS 8.0
    # https://www.paloaltonetworks.com/documentation/80/pan-os/pan-os/monitoring/netflow-monitoring/netflow-templates
    346: 'PANOS_privateEnterpriseNumber',
    56701: 'PANOS_APPID',
    56702: 'PANOS_USERID'
}
|
|
|
|
|
|
class TemplateNotRecognized(KeyError):
    """Raised when a DataFlowSet references a template_id that is not
    present in the known-templates dict (i.e. the corresponding
    TemplateFlowSet has not been received yet)."""
    pass
|
|
|
|
|
|
class DataRecord:
    """A single parsed flow record.

    Which fields a record contains is variable in NetFlow v9: it depends on
    the template the exporter sent. Values live in the ``data`` dict, keyed
    by the symbolic field name resolved through FIELD_TYPES, with the raw
    (or, for IP addresses, stringified) value as the entry.
    """

    def __init__(self):
        # Filled in by DataFlowSet while parsing the flowset payload.
        self.data = {}

    def __repr__(self):
        return f"<DataRecord with data: {self.data}>"
|
|
|
|
|
|
class DataFlowSet:
    """Holds one or multiple DataRecord which are all defined after the same
    template. This template is referenced in the field 'flowset_id' of this
    DataFlowSet and must not be zero.

    :param data: raw packet bytes starting at this flowset's 4-byte header
    :param templates: dict mapping template_id -> TemplateRecord
    :raises TemplateNotRecognized: if the referenced template_id is unknown
    """
    def __init__(self, data, templates):
        pack = struct.unpack('!HH', data[:4])

        self.template_id = pack[0]  # flowset_id is reference to a template_id
        self.length = pack[1]       # total flowset length in bytes, incl. header
        self.flows = []

        offset = 4  # skip the flowset header

        if self.template_id not in templates:
            raise TemplateNotRecognized

        template = templates[self.template_id]

        # As the field lengths are variable V9 has padding to next 32 Bit.
        # NOTE(review): self.length is normally already 32-bit aligned, so this
        # evaluates to 4 and the loop bound becomes length - 4. That is safe as
        # long as each record is at least 4 bytes long — TODO confirm against
        # exporters that emit smaller records.
        padding_size = 4 - (self.length % 4)  # 4 Byte

        while offset <= (self.length - padding_size):
            new_record = DataRecord()

            for field in template.fields:
                flen = field.field_length
                fkey = FIELD_TYPES[field.field_type]

                # The length of the value byte slice is defined in the template
                dataslice = data[offset:offset+flen]

                # Advance unconditionally. The previous version skipped this
                # via 'continue' when an IP address failed to parse, which
                # desynchronized every following field of the record.
                offset += flen

                # Better solution than struct.unpack with variable field length
                fdata = 0
                for idx, byte in enumerate(reversed(bytearray(dataslice))):
                    fdata += byte << (idx * 8)

                # Special handling of IP addresses to convert integers to
                # strings to not lose precision in dump
                if fkey in ["IPV4_SRC_ADDR", "IPV4_DST_ADDR", "IPV6_SRC_ADDR", "IPV6_DST_ADDR"]:
                    try:
                        ip = ipaddress.ip_address(fdata)
                    except ValueError:
                        print("IP address could not be parsed: {}".format(fdata))
                        continue  # drop only this field; offset already advanced
                    new_record.data[fkey] = ip.compressed
                else:
                    new_record.data[fkey] = fdata

            self.flows.append(new_record)

    def __repr__(self):
        return "<DataFlowSet with template {} of length {} holding {} flows>"\
            .format(self.template_id, self.length, len(self.flows))
|
|
|
|
|
|
class TemplateField:
    """Pairs a NetFlow field type identifier with the length of its value.

    :param field_type: integer field type ID (key into FIELD_TYPES)
    :param field_length: size of the field's value in bytes
    """

    def __init__(self, field_type, field_length):
        self.field_type = field_type      # integer ID, see FIELD_TYPES
        self.field_length = field_length  # value size in bytes

    def __repr__(self):
        type_name = FIELD_TYPES[self.field_type]
        return "<TemplateField type {}:{}, length {}>".format(
            self.field_type, type_name, self.field_length)
|
|
|
|
|
|
class TemplateRecord:
    """A template record contained in a TemplateFlowSet.

    :param template_id: numeric id data flowsets use to reference this template
    :param field_count: number of fields announced by the exporter
    :param fields: list of TemplateField (or objects with .field_type /
        .field_length attributes)
    """
    def __init__(self, template_id, field_count, fields):
        self.template_id = template_id
        self.field_count = field_count
        self.fields = fields

    def __eq__(self, other):
        # Compare by content. Without this, the '!=' check in V9ExportPacket
        # used identity, so every retransmitted (unchanged) template was
        # flagged as a new template.
        if not isinstance(other, TemplateRecord):
            return NotImplemented
        return (self.template_id == other.template_id
                and self.field_count == other.field_count
                and [(f.field_type, f.field_length) for f in self.fields]
                == [(f.field_type, f.field_length) for f in other.fields])

    def __hash__(self):
        # Keep instances hashable alongside the custom __eq__.
        return hash((self.template_id, self.field_count,
                     tuple((f.field_type, f.field_length) for f in self.fields)))

    def __repr__(self):
        return "<TemplateRecord {} with {} fields: {}>".format(
            self.template_id, self.field_count,
            ' '.join([FIELD_TYPES[field.field_type] for field in self.fields]))
|
|
|
|
|
|
class TemplateFlowSet:
    """A template flowset, which holds an id that is used by data flowsets to
    reference back to the template. The template then has fields which hold
    identifiers of data types (eg "IP_SRC_ADDR", "PKTS"..). This way the flow
    sender can dynamically put together data flowsets.

    :param data: raw packet bytes starting at this flowset's 4-byte header
    """

    def __init__(self, data):
        # Header: flowset id (always 0 for templates) and total length in
        # bytes, including this header.
        self.flowset_id, self.length = struct.unpack('!HH', data[:4])
        self.templates = {}

        cursor = 4  # skip the flowset header

        # Walk over every template record contained in this flowset.
        while cursor < self.length:
            template_id, field_count = struct.unpack('!HH', data[cursor:cursor+4])

            fields = []
            for _ in range(field_count):
                cursor += 4  # advance to this field's 4-byte descriptor
                field_type, field_length = struct.unpack('!HH', data[cursor:cursor+4])
                if field_type not in FIELD_TYPES:
                    field_type = 0  # fall back to UNKNOWN_FIELD_TYPE
                fields.append(TemplateField(field_type, field_length))

            # Register the fully parsed template under its id.
            self.templates[template_id] = TemplateRecord(template_id, field_count, fields)

            # Step past the last field descriptor to the next template header.
            cursor += 4

    def __repr__(self):
        return "<TemplateFlowSet with id {} of length {} containing templates: {}>"\
            .format(self.flowset_id, self.length, self.templates.keys())
|
|
|
|
|
|
class Header:
    """The 20-byte packet header of a V9ExportPacket."""

    # Header size in bytes; also used by V9ExportPacket as the first offset.
    length = 20

    def __init__(self, data):
        unpacked = struct.unpack('!HHIIII', data[:self.length])
        (self.version,    # protocol version, 9 for NetFlow v9
         self.count,      # per softflowd: number of flows (semantics unverified)
         self.uptime,     # exporter uptime in milliseconds
         self.timestamp,  # export time, seconds since epoch
         self.sequence,   # packet sequence counter
         self.source_id) = unpacked
|
|
|
|
|
|
class V9ExportPacket:
    """The flow record holds the header and all template and data flowsets."""

    def __init__(self, data, templates):
        self.header = Header(data)
        self.templates = templates
        self._new_templates = False
        self.flows = []

        offset = self.header.length
        while offset != len(data):
            flowset_id = struct.unpack('!H', data[offset:offset+2])[0]

            if flowset_id != 0:
                # Non-zero ids reference a template: parse as data flowset.
                dfs = DataFlowSet(data[offset:], self.templates)
                self.flows.extend(dfs.flows)
                offset += dfs.length
                continue

            # TemplateFlowSets always have id 0.
            tfs = TemplateFlowSet(data[offset:])

            # Flag the packet if it introduced or replaced any template.
            if not self._new_templates:
                self._new_templates = any(
                    tid not in self.templates or self.templates[tid] != tmpl
                    for tid, tmpl in tfs.templates.items()
                )

            self.templates.update(tfs.templates)
            offset += tfs.length

    @property
    def contains_new_templates(self):
        """True if this packet carried templates not already known (or
        differing from the known ones) at construction time."""
        return self._new_templates

    def __repr__(self):
        suffix = " and new template(s)" if self.contains_new_templates else ""
        return "<ExportPacket v{} with {} records{}>".format(
            self.header.version, self.header.count, suffix)
|