""" Plugin to find python strings within process heaps. """ import os import re import struct from itertools import groupby from volatility import debug as debug from volatility import obj as obj from volatility.plugins.linux import common as linux_common from volatility.plugins.linux import pslist as linux_pslist from volatility.renderers import TreeGrid from volatility import utils # Note: It doesn't actually matter if Py_TRACE_REF is defined, that just means # there are more structures at the beginning, which we don't care about pyobjs_vtype_64 = { '_PyStringObject': [ 37, { 'ob_refcnt': [0, ['long long']], # Py_ssize_t = ssize_t 'ob_type': [8, ['pointer', ['void']]], # struct _typeobject * 'ob_size': [16, ['long long']], # Py_ssize_t = ssize_t 'ob_shash': [24, ['long long']], 'ob_sstate': [32, ['Enumeration', dict(target='int', choices={ 0: 'SSTATE_NOT_INTERNED', 1: 'SSTATE_INTERNED_MORTAL', 2: 'SSTATE_INTERNED_IMMORTAL' })]], 'ob_sval': [36, ['array', 1, ['char']]] }], '_PyDictEntry': [ 24, { 'me_hash': [0, ['long long']], # Py_ssize_t = ssize_t 'me_key': [8, ['pointer', ['_PyStringObject']]], 'me_value': [16, ['pointer', ['_PyStringObject']]] }] } class _PyStringObject(obj.CType): r""" A class for python string objects. ---- stringobject.h ---- typedef struct { PyObject_VAR_HEAD long ob_shash; int ob_sstate; char ob_sval[1]; /* Invariants: * ob_sval contains space for 'ob_size+1' elements. * ob_sval[ob_size] == 0. * ob_shash is the hash of the string or -1 if not computed yet. * ob_sstate != 0 iff the string object is in stringobject.c's * 'interned' dictionary; in this case the two references * from 'interned' to this object are *not counted* in * ob_refcnt. */ } PyStringObject; #define SSTATE_NOT_INTERNED 0 #define SSTATE_INTERNED_MORTAL 1 #define SSTATE_INTERNED_IMMORTAL 2 ---- object.h - note that _PyObject_HEAD_EXTRA is empty if Py_TRACE_REFs is not defined ---- /* PyObject_HEAD defines the initial segment of every PyObject. */ #define PyObject_HEAD \ _PyObject_HEAD_EXTRA \ Py_ssize_t ob_refcnt; \ struct _typeobject *ob_type; #define PyObject_VAR_HEAD \ PyObject_HEAD \ Py_ssize_t ob_size; /* Number of items in variable part */ """ def is_valid(self): """ Determine whether the Python string struct is valid - an easy way to check is to calculate the hash of the string, and see if it matches the `ob_shash`. On Python 2.7, the hash function used is FNV. This assumes that the python version volatility is using matches the python version of the memory dump, because it uses the `hash()` function to compute the hash. """ ob_sval_offset, _ = self.members['ob_sval'] string_address = self.obj_offset + ob_sval_offset return ( self.ob_type.is_valid() and # skip empty strings and strings that are too big self.ob_size > 0 and self.ob_size <= 1e6 and # state must be one of the valid states self.ob_sstate.v() in self.ob_sstate.choices.keys() and # the string should be null-terminated self.obj_vm.zread(string_address + self.ob_size, 1) == '\x00' and # the hash may not have been computed (-1), but otherwise # it should be correct (self.ob_shash == -1 or self.ob_shash == hash(self.string))) @property def string(self): """ Read the string from memory, because `ob_sval` is a :class:`volatility.obj.NativeType.Array` object, which is slow to iterate through to turn into a string. """ sval_offset, _ = self.members['ob_sval'] return self.obj_vm.zread(self.obj_offset + sval_offset, self.ob_size) class _StringStringPyDictEntry(obj.CType): r""" ---- dictobject.h ---- typedef struct { Py_ssize_t me_hash; PyObject *me_key; PyObject *me_value; } PyDictEntry; ---- object.h ---- /* Nothing is actually declared to be a PyObject, but every pointer to * a Python object can be cast to a PyObject*. This is inheritance built * by hand. Similarly every pointer to a variable-size Python object can, * in addition, be cast to PyVarObject*. */ typedef struct _object { PyObject_HEAD } PyObject; """ def is_valid(self): """ Determine whether the {Python string key: Python string val} PyDictEntry struct is valid. Both pointers should be valid, and the hash of the entry should be the same as the hash of the key. """ if self.me_key.is_valid() and self.me_value.is_valid(): key = self.key if key.is_valid() and key.ob_shash == self.me_hash: return self.value.is_valid() return False @property def key(self): return self.me_key.dereference() @property def value(self): return self.me_value.dereference() class PythonStringTypes(obj.ProfileModification): """ Profile modifications for Python string types. Only Linux and Mac OS, on 64-bit systems, are supported right now. """ conditions = {"os": lambda x: x in ["linux", "mac"], "memory_model": lambda x: x == "64bit"} def modification(self, profile): """ Add python string overlays to the profile's vtypes. """ profile.vtypes.update(pyobjs_vtype_64) profile.object_classes.update({ "_PyStringObject": _PyStringObject, "_PyDictEntry": _StringStringPyDictEntry }) def brute_force_search(addr_space, obj_type_string, start, end, step_size=1): """ Brute-force search an area of memory for a given object type. Returns valid types as a generator. """ offset = start while offset < end: found_object = obj.Object(obj_type_string, offset=offset, vm=addr_space) if found_object.is_valid(): yield found_object offset += found_object.size() + found_object.ob_size else: offset += step_size def _brute_force_5_strings(addr_space, heaps): """ Search the heaps 5K at a time until 5 strings are found. Why 5? Arbitrary. Just so long as it's not 1, which may be a false positive. """ bfed_strings = [] chunk_size = 1024 * 5 for heap_vma in heaps: for chunk_start in xrange(heap_vma.vm_start, heap_vma.vm_end, chunk_size): bfed_strings.extend(list(brute_force_search( addr_space=addr_space, obj_type_string="_PyStringObject", start=chunk_start, end=chunk_start + chunk_size - 1, step_size=4))) if len(bfed_strings) >= 5: return bfed_strings def find_python_strings(task): """ Attempt to find python strings. Brute-force search is pretty slow, so we are going to optimize a bit. The `ob_type` of a PyObjString is a pretty involved struct, so we are not searching on that pattern, but all Python strings should point to the same type in memory. We will brute-force search the heaps only until a couple of strings are found. We want to make sure that they all point to the same type in memory. Once we have a good guess at where that type resides in memory, we can search specifically for that address value and use that as a hint as to where there might be a PyObjString. We want to search the rest of memory though """ addr_space = task.get_process_address_space() heaps_and_anon = get_heaps_and_anon(task) likely_strings = _brute_force_5_strings(addr_space, heaps_and_anon) likely_strings_by_type = { pointer: list(strings) for pointer, strings in groupby(likely_strings, lambda pystr: pystr.ob_type) } debug.debug("Found {0} possible str _typeobject pointer(s): {1}".format( len(likely_strings_by_type), ", ".join([ "0x{0:012x} ({1})".format(pointer.v(), len(strings)) for pointer, strings in likely_strings_by_type.iteritems()]))) memory_model = addr_space.profile.metadata.get('memory_model', '32bit') pack_format = "I" if memory_model == '32bit' else "Q" offset = addr_space.profile.get_obj_offset("_PyStringObject", "ob_type") str_types_as_bytes = [struct.pack(pack_format, pointer.v()) for pointer in likely_strings_by_type] for address in search_vmas(str_types_as_bytes, heaps_and_anon, task): # We will find the likely_strings again, but that's ok py_string = obj.Object("_PyStringObject", offset=address - offset, vm=addr_space) if py_string.is_valid(): yield py_string def search_vmas(s, vmas, task): """ Searches VMAs for lists of strings. volatility.plugins.overlays.linux.linux.task_struct.search_process_memory could be used, but we want to search more than the heap and less than all of process memory. This code is mostly copied from there. """ # Allow for some overlap in case objects are # right on page boundaries overlap = 1024 scan_blk_sz = 1024 * 1024 * 10 addr_space = task.get_process_address_space() for vma in vmas: offset = vma.vm_start out_of_range = vma.vm_start + (vma.vm_end - vma.vm_start) while offset < out_of_range: # Read some data and match it. to_read = min(scan_blk_sz + overlap, out_of_range - offset) data = addr_space.zread(offset, to_read) if not data: break for x in s: for hit in utils.iterfind(data, x): yield offset + hit offset += min(to_read, scan_blk_sz) def get_heaps_and_anon(task): """ Given a task, return the mapped sections corresponding to that task's heaps and anonymous mappings (since CPython sometimes mmaps things). """ for vma in task.get_proc_maps(): if (vma.vm_start <= task.mm.start_brk and vma.vm_end >= task.mm.brk): yield vma elif vma.vm_name(task) == "Anonymous Mapping": yield vma def _is_python_task(task): """ Return true if this is a python task (as per the executable name, not necessarily by task name), false otherwise. """ code_area = [vma for vma in task.get_proc_maps() if (task.mm.start_code >= vma.vm_start and task.mm.end_code <= vma.vm_end)] return code_area and 'python' in code_area[0].vm_name(task) class linux_python_strings(linux_pslist.linux_pslist): """ Pull python strings from a process's heap. """ def __init__(self, config, *args, **kwargs): """ Add a configuration for checking strings, basically a regex to check for. """ linux_pslist.linux_pslist.__init__(self, config, *args, **kwargs) self._config.add_option( 'REGEX', default='None', type='string', help='Provide a regex: only return strings that match the regex.') self._config.add_option( 'DUMP-DIR', default='None', type='string', help='Output strings to file(s) in this dump directory.') def _validate_config(self): """ Check the config values, and converts them to the right value. """ if self._config.REGEX: self._config.REGEX = re.compile(self._config.REGEX) if (self._config.DUMP_DIR is not None and not os.path.isdir(os.path.expanduser(self._config.DUMP_DIR))): debug.error(self._config.DUMP_DIR + " is not a directory") self._config.DUMP_DIR = None def calculate(self): """ Find the tasks that are actually python processes. May not necessarily be called "python", but the executable is python. Then find all python strings in that process's heap. """ linux_common.set_plugin_members(self) self._validate_config() tasks = [task for task in linux_pslist.linux_pslist.calculate(self) if _is_python_task(task)] for task in tasks: for py_string in find_python_strings(task): if (self._config.REGEX is None or self._config.REGEX.match(py_string.string)): yield task, py_string def unified_output(self, data): """ Return a TreeGrid with data to print out. """ return TreeGrid([("Pid", int), ("Name", str), ("Size", int), ("String", str)], self.generator(data)) def generator(self, data): """ If writing to a file is desired, write to a file. Also generate data that may be formatted for printing. """ files = {} for task, py_string in data: if self._config.DUMP_DIR is not None: filename = "{0}.{1}.strings".format(task.pid, task.comm) if task.pid not in files: files[task.pid] = open(os.path.expanduser(os.path.join( self._config.DUMP_DIR, filename)), 'wb') files[task.pid].write(repr(py_string.string)) files[task.pid].write("\n") yield (0, [int(task.pid), str(task.comm), int(py_string.ob_size), py_string.string]) for file_handle in files.values(): file_handle.close() def render_text(self, outfd, data): self.table_header(outfd, [("Pid", "15"), ("Name", "10"), ("Size", "10"), ("String", "50")]) for _, output in self.generator(data): self.table_row(outfd, *[str(o) for o in output]) class linux_python_str_dict_entry(linux_pslist.linux_pslist): """ Pull {python-strings: python-string} dictionary entries from a process's heap. """ def calculate(self): """ Get all the python strings for a task, and assume those strings might be keys of a dictionary entry. Return the valid dictionary entries from that pool of maybes. This repeats a lot of linux_python_strings's code, but we want to get python strings per task, so we can optimize the bytstring search. """ linux_common.set_plugin_members(self) tasks = [task for task in linux_pslist.linux_pslist.calculate(self) if _is_python_task(task)] for task in tasks: addr_space = task.get_process_address_space() memory_model = addr_space.profile.metadata.get('memory_model', '32bit') pack_format = "I" if memory_model == '32bit' else "Q" bytestrings = [ # the hash as bytes struct.pack(pack_format.lower(), py_string.ob_shash) + # the pointer the PyStringObject as bytes struct.pack(pack_format, py_string.obj_offset) for py_string in find_python_strings(task) ] for address in task.search_process_memory(bytestrings, heap_only=True): py_dict_entry = obj.Object("_PyDictEntry", offset=address, vm=addr_space) if py_dict_entry.is_valid(): yield task, py_dict_entry def unified_output(self, data): """ Return a TreeGrid with data to print out. """ return TreeGrid([("Pid", int), ("Name", str), ("Key", str), ("Value", str)], self.generator(data)) def generator(self, data): """ Generate data that may be formatted for printing. """ for task, py_dict_entry in data: yield (0, [int(task.pid), str(task.comm), py_dict_entry.key.string, py_dict_entry.value.string])