Improve collector script and restructure code

- Moved the netflow library out of the src directory
- Restructured the UDP listener so that multiple threads receive packets
  and push them onto a queue. The main thread then pulls packets off the
  queue one at a time and processes them, so the collector never drops a
  packet because it was blocked on processing the previous one (see the
  sketch after this list).
- Added a property to the ExportPacket class that exposes whether any
  new templates are contained in it.
- The collector now only retries parsing buffered packets when a new
  template is found. Also refactored the retry logic to remove duplicate
  code: retrying simply pushes the packets back onto the main queue to
  be processed again like any other packet.
- The collector no longer continually reads from and writes to the disk.
  Instead, it caches the data in memory and writes it out once on exit.
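For reference, the listener change follows a standard producer/consumer
pattern: handler threads do nothing but timestamp and enqueue raw
datagrams, and a single consumer parses them at its own pace. A minimal
standalone sketch of that pattern using only the standard library (the
names below are illustrative, not this repo's API):

    import queue
    import socketserver
    import threading
    import time

    packets = queue.Queue()  # shared buffer between listener threads and the consumer

    class EnqueueingHandler(socketserver.BaseRequestHandler):
        def handle(self):
            # No parsing here: just timestamp the datagram and enqueue it,
            # so the socket is serviced again as quickly as possible
            packets.put((time.time(), self.request[0]))

    # ThreadingUDPServer spawns a thread per received datagram
    server = socketserver.ThreadingUDPServer(("0.0.0.0", 2055), EnqueueingHandler)
    threading.Thread(target=server.serve_forever, daemon=True).start()

    while True:
        ts, data = packets.get()  # blocks until a datagram arrives
        ...  # parse the packet here without stalling the listener threads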
Carey Metcalfe 2019-10-05 03:07:02 -04:00
parent ce2be709d6
commit ef151f8d28
3 changed files with 113 additions and 118 deletions

main.py (186 changed lines; mode changed: Normal file → Executable file)

@@ -8,133 +8,123 @@ Copyright 2017-2019 Dominik Pataky <dev@bitkeks.eu>
 Licensed under MIT License. See LICENSE.
 """
-import logging
 import argparse
+from collections import namedtuple
+from queue import Queue
+import json
+import logging
 import sys
 import socketserver
+import threading
 import time
-import json
-import os.path
-
-logging.getLogger().setLevel(logging.INFO)
-ch = logging.StreamHandler(sys.stdout)
-ch.setLevel(logging.DEBUG)
-formatter = logging.Formatter('%(message)s')
-ch.setFormatter(formatter)
-logging.getLogger().addHandler(ch)
-
-try:
-    from netflow.collector_v9 import ExportPacket, TemplateNotRecognized
-except ImportError:
-    logging.warning("Netflow v9 not installed as package! Running from directory.")
-    from src.netflow.collector_v9 import ExportPacket, TemplateNotRecognized
-
-parser = argparse.ArgumentParser(description="A sample netflow collector.")
-parser.add_argument("--host", type=str, default="",
-                    help="collector listening address")
-parser.add_argument("--port", "-p", type=int, default=2055,
-                    help="collector listener port")
-parser.add_argument("--file", "-o", type=str, dest="output_file",
-                    default="{}.json".format(int(time.time())),
-                    help="collector export JSON file")
-parser.add_argument("--debug", "-D", action="store_true",
-                    help="Enable debug output")
-
-class SoftflowUDPHandler(socketserver.BaseRequestHandler):
-    # We need to save the templates our NetFlow device
-    # send over time. Templates are not resended every
-    # time a flow is sent to the collector.
-    templates = {}
-    buffered = {}
-
-    @classmethod
-    def set_output_file(cls, path):
-        cls.output_file = path
-
+
+from netflow.v9 import ExportPacket, TemplateNotRecognized
+
+__log__ = logging.getLogger(__name__)
+
+# Amount of time to wait before dropping an undecodable ExportPacket
+PACKET_TIMEOUT = 60 * 60
+
+RawPacket = namedtuple('RawPacket', ['ts', 'data'])
+
+
+class QueuingRequestHandler(socketserver.BaseRequestHandler):
     def handle(self):
-        if not os.path.exists(self.output_file):
-            with open(self.output_file, 'w') as fh:
-                json.dump({}, fh)
-        with open(self.output_file, 'r') as fh:
-            try:
-                existing_data = json.load(fh)
-            except json.decoder.JSONDecodeError as ex:
-                logging.error("Malformed JSON output file. Cannot read existing data, aborting.")
-                return
         data = self.request[0]
-        host = self.client_address[0]
-        logging.debug("Received data from {}, length {}".format(host, len(data)))
-        export = None
-        try:
-            export = ExportPacket(data, self.templates)
-        except TemplateNotRecognized:
-            self.buffered[time.time()] = data
-            logging.warning("Received data with unknown template, data stored in buffer!")
-            return
-        if not export:
-            logging.error("Error with exception handling while disecting export, export is None")
-            return
-
-        logging.debug("Processed ExportPacket with {} flows.".format(export.header.count))
-        logging.debug("Size of buffer: {}".format(len(self.buffered)))
-
-        # In case the export held some new templates
-        self.templates.update(export.templates)
-
-        remain_buffered = {}
-        processed = []
-        for timestamp, data in self.buffered.items():
+        self.server.queue.put(RawPacket(time.time(), data))
+        __log__.debug(
+            "Received %d bytes of data from %s", len(data), self.client_address[0]
+        )
+
+
+class QueuingUDPListener(socketserver.ThreadingUDPServer):
+    """A threaded UDP server that adds a (time, data) tuple to a queue for
+    every request it sees
+    """
+    def __init__(self, interface, queue):
+        self.queue = queue
+        super().__init__(interface, QueuingRequestHandler)
+
+
+def get_export_packets(host, port):
+    """A generator that will yield ExportPacket objects until it is killed
+    or has a truthy value sent to it"""
+
+    __log__.info("Starting the NetFlow listener on {}:{}".format(host, port))
+    queue = Queue()
+    server = QueuingUDPListener((host, port), queue)
+    thread = threading.Thread(target=server.serve_forever)
+    thread.start()
+
+    # Process packets from the queue
+    templates = {}
+    to_retry = []
+    try:
+        while True:
+            pkt = queue.get()
             try:
-                buffered_export = ExportPacket(data, self.templates)
-                processed.append(timestamp)
+                export = ExportPacket(pkt.data, templates)
             except TemplateNotRecognized:
-                remain_buffered[timestamp] = data
-                logging.debug("Template of buffered ExportPacket still not recognized")
+                if time.time() - pkt.ts > PACKET_TIMEOUT:
+                    __log__.warning("Dropping an old and undecodable ExportPacket")
+                else:
+                    to_retry.append(pkt)
+                    __log__.debug("Failed to decode an ExportPacket - will "
+                                  "re-attempt when a new template is discovered")
                 continue
-            logging.debug("Processed buffered ExportPacket with {} flows.".format(buffered_export.header.count))
-            existing_data[timestamp] = [flow.data for flow in buffered_export.flows]
-
-        # Delete processed items from the buffer
-        for pro in processed:
-            del self.buffered[pro]
-
-        # Update the buffer
-        self.buffered.update(remain_buffered)
-
-        # Append new flows
-        existing_data[time.time()] = [flow.data for flow in export.flows]
-
-        with open(self.output_file, 'w') as fh:
-            json.dump(existing_data, fh)
+
+            __log__.debug("Processed an ExportPacket with %d flows.",
+                          export.header.count)
+
+            # If any new templates were discovered, dump the unprocessable
+            # data back into the queue and try to decode them again
+            if export.contains_new_templates and to_retry:
+                __log__.debug("Received new template(s)")
+                __log__.debug("Will re-attempt to decode %d old ExportPackets",
+                              len(to_retry))
+                for p in to_retry:
+                    queue.put(p)
+                to_retry.clear()
+
+            stop = yield pkt.ts, export
+            if stop:
+                break
+    finally:
+        __log__.info("Shutting down the NetFlow listener")
+        server.shutdown()
+        server.server_close()
+        thread.join()
+
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="A sample netflow collector.")
+    parser.add_argument("--host", type=str, default="0.0.0.0",
+                        help="collector listening address")
+    parser.add_argument("--port", "-p", type=int, default=2055,
+                        help="collector listener port")
+    parser.add_argument("--file", "-o", type=str, dest="output_file",
+                        default="{}.json".format(int(time.time())),
+                        help="collector export JSON file")
+    parser.add_argument("--debug", "-D", action="store_true",
+                        help="Enable debug output")
     args = parser.parse_args()
+
+    logging.basicConfig(level=logging.INFO, stream=sys.stdout, format="%(message)s")
     if args.debug:
-        logging.getLogger().setLevel(logging.DEBUG)
-
-    output_file = args.output_file
-    SoftflowUDPHandler.set_output_file(output_file)
-    host = args.host
-    port = args.port
-    logging.info("Listening on interface {}:{}".format(host, port))
-    server = socketserver.UDPServer((host, port), SoftflowUDPHandler)
+        __log__.setLevel(logging.DEBUG)
+
+    data = {}
     try:
-        logging.debug("Starting the NetFlow listener")
-        server.serve_forever(poll_interval=0.5)
-    except (IOError, SystemExit):
-        raise
+        # TODO: For a long-running process, this will consume loads of memory
+        for ts, export in get_export_packets(args.host, args.port):
+            data[ts] = [flow.data for flow in export.flows]
     except KeyboardInterrupt:
-        raise
-    server.server_close()
+        pass
+
+    if data:
+        __log__.info("Outputting collected data to '%s'", args.output_file)
+        with open(args.output_file, 'w') as f:
+            json.dump(data, f)
+    else:
+        __log__.info("No data collected")

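Because get_export_packets() above is a generator, a caller can also stop
the listener cleanly by sending it a truthy value instead of relying on
KeyboardInterrupt. A short usage sketch (assuming main.py is importable
as main; the cut-off condition is purely illustrative):

    from main import get_export_packets

    flows = []
    stream = get_export_packets("0.0.0.0", 2055)
    for ts, export in stream:
        flows.extend(flow.data for flow in export.flows)
        if len(flows) >= 1000:  # arbitrary example stop condition
            try:
                stream.send(True)  # a truthy value makes the generator break
            except StopIteration:
                pass  # raised once the generator has shut the listener down
            break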
netflow/v9.py (moved out of the src directory)

@@ -11,12 +11,10 @@ Copyright 2017, 2018 Dominik Pataky <dev@bitkeks.eu>
 Licensed under MIT License. See LICENSE.
 """
-import socket
 import struct
-import sys
 
-field_types = {
+FIELD_TYPES = {
     0: 'UNKNOWN_FIELD_TYPE', # fallback for unknown field types
 
     # Cisco specs for NetFlow v9
@@ -153,10 +151,14 @@ field_types = {
 }
 
+class TemplateNotRecognized(KeyError):
+    pass
+
+
 class DataRecord:
     """This is a 'flow' as we want it from our source. What it contains is
     variable in NetFlow V9, so to work with the data you have to analyze the
-    data dict keys (which are integers and can be mapped with the field_types
+    data dict keys (which are integers and can be mapped with the FIELD_TYPES
     dict).
 
     Should hold a 'data' dict with keys=field_type (integer) and value (in bytes).
@@ -195,7 +197,7 @@ class DataFlowSet:
         for field in template.fields:
             flen = field.field_length
-            fkey = field_types[field.field_type]
+            fkey = FIELD_TYPES[field.field_type]
             fdata = None
 
             # The length of the value byte slice is defined in the template
@@ -218,20 +220,18 @@
 class TemplateField:
-    """A field with type identifier and length.
-    """
+    """A field with type identifier and length."""
     def __init__(self, field_type, field_length):
         self.field_type = field_type  # integer
         self.field_length = field_length  # bytes
 
     def __repr__(self):
         return "<TemplateField type {}:{}, length {}>".format(
-            self.field_type, field_types[self.field_type], self.field_length)
+            self.field_type, FIELD_TYPES[self.field_type], self.field_length)
 
 
 class TemplateRecord:
-    """A template record contained in a TemplateFlowSet.
-    """
+    """A template record contained in a TemplateFlowSet."""
     def __init__(self, template_id, field_count, fields):
         self.template_id = template_id
         self.field_count = field_count
@@ -240,7 +240,7 @@ class TemplateRecord:
     def __repr__(self):
         return "<TemplateRecord {} with {} fields: {}>".format(
             self.template_id, self.field_count,
-            ' '.join([field_types[field.field_type] for field in self.fields]))
+            ' '.join([FIELD_TYPES[field.field_type] for field in self.fields]))
 
 
 class TemplateFlowSet:
@@ -268,7 +268,7 @@ class TemplateFlowSet:
             # Get all fields of this template
             offset += 4
             field_type, field_length = struct.unpack('!HH', data[offset:offset+4])
-            if field_type not in field_types:
+            if field_type not in FIELD_TYPES:
                 field_type = 0  # Set field_type to UNKNOWN_FIELD_TYPE as fallback
             field = TemplateField(field_type, field_length)
             fields.append(field)
@@ -288,8 +288,7 @@
 class Header:
-    """The header of the ExportPacket.
-    """
+    """The header of the ExportPacket."""
     def __init__(self, data):
         pack = struct.unpack('!HHIIII', data[:20])
@@ -302,11 +301,11 @@ class Header:
 class ExportPacket:
-    """The flow record holds the header and all template and data flowsets.
-    """
+    """The flow record holds the header and all template and data flowsets."""
     def __init__(self, data, templates):
         self.header = Header(data)
         self.templates = templates
+        self._new_templates = False
         self.flows = []
 
         offset = 20
@@ -314,6 +313,12 @@ class ExportPacket:
             flowset_id = struct.unpack('!H', data[offset:offset+2])[0]
             if flowset_id == 0:  # TemplateFlowSet always have id 0
                 tfs = TemplateFlowSet(data[offset:])
+                # Check for any new/changed templates
+                if not self._new_templates:
+                    for id_, template in tfs.templates.items():
+                        if id_ not in self.templates or self.templates[id_] != template:
+                            self._new_templates = True
+                            break
                 self.templates.update(tfs.templates)
                 offset += tfs.length
             else:
@@ -321,10 +326,10 @@
                 self.flows += dfs.flows
                 offset += dfs.length
 
+    @property
+    def contains_new_templates(self):
+        return self._new_templates
+
     def __repr__(self):
         return "<ExportPacket version {} counting {} records>".format(
             self.header.version, self.header.count)
-
-
-class TemplateNotRecognized(KeyError):
-    pass