Source code for synapse.lib.stormlib.scrape

import asyncio
import logging

import regex

import synapse.common as s_common

import synapse.lib.coro as s_coro
import synapse.lib.scrape as s_scrape
import synapse.lib.spooled as s_spooled
import synapse.lib.stormtypes as s_stormtypes

logger = logging.getLogger(__name__)

@s_stormtypes.registry.registerLib
class LibScrape(s_stormtypes.Lib):
    '''
    A Storm Library for providing helpers for scraping nodes from text.
    '''
    _storm_locals = (
        {'name': 'context', 'desc': '''
            Attempt to scrape information from a blob of text, getting the context information about the values found.

            Notes:
                This does call the ``scrape`` Storm interface if that behavior is enabled on the Cortex.

            Examples:
                Scrape some text and make nodes out of it::

                    for ($form, $valu, $info) in $lib.scrape.context($text) {
                        [ ( *$form ?= $valu ) ]
                    }
            ''',
         'type': {'type': 'function', '_funcname': '_methContext',
                  'args': (
                      {'name': 'text', 'type': 'str',
                       'desc': 'The text to scrape', },
                  ),
                  'returns': {'name': 'yields', 'type': 'dict',
                              'desc': 'A dictionary of scraped values, rule types, and offsets scraped from the text.',
                              }}},
        {'name': 'ndefs', 'desc': '''
            Attempt to scrape node form, value tuples from a blob of text.

            Examples:
                Scrape some text and attempt to make nodes out of it::

                    for ($form, $valu) in $lib.scrape.ndefs($text) {
                        [ ( *$form ?= $valu ) ]
                    }''',
         'type': {'type': 'function', '_funcname': '_methNdefs',
                  'args': (
                      {'name': 'text', 'type': 'str',
                       'desc': 'The text to scrape', },
                  ),
                  'returns': {'name': 'yields', 'type': 'list',
                              'desc': 'A list of (form, value) tuples scraped from the text.', }}},
        {'name': 'genMatches', 'desc': '''
            genMatches is a generic helper function for constructing scrape interfaces using pure Storm.

            It accepts the text and a regex pattern, and produces (value, info) results that can easily
            be used to create the values returned by a scrape implementation.

            Notes:
                The pattern must have a named regular expression match for the key ``valu`` using the
                named group syntax. For example ``(somekey\\s)(?P<valu>[a-z0-9]+)\\s``.

            Examples:
                A scrape implementation with a regex that matches name keys in text::

                    $re="(Name\\:\\s)(?P<valu>[a-z0-9]+)\\s"
                    $form="ps:name"

                    function scrape(text, form) {
                        $ret = $lib.list()
                        for ($valu, $info) in $lib.scrape.genMatches($text, $re) {
                            $ret.append(($form, $valu, $info))
                        }
                        return ( $ret )
                    }
            ''',
         'type': {'type': 'function', '_funcname': '_methGenMatches',
                  'args': (
                      {'name': 'text', 'type': 'str',
                       'desc': 'The text to scrape', },
                      {'name': 'pattern', 'type': 'str',
                       'desc': 'The regular expression pattern to match against.', },
                      {'name': 'fangs', 'type': 'list', 'default': None,
                       'desc': 'A list of (src, dst) pairs to refang from text. The src must be equal or larger '
                               'than the dst in length.'},
                      {'name': 'flags', 'type': 'int', 'default': regex.IGNORECASE,
                       'desc': 'Regex flags to use (defaults to IGNORECASE).'},
                  ),
                  'returns': {'name': 'yields', 'type': 'list',
                              'desc': 'A list of (value, info) tuples for each match of the pattern in the text.', }}},
    )
    _storm_lib_path = ('scrape', )

    def getObjLocals(self):
        '''
        Return the mapping of Storm-visible names to the bound methods which implement them.
        '''
        return {
            'ndefs': self._methNdefs,
            'context': self._methContext,
            'genMatches': self._methGenMatches,
        }

    async def __call__(self, text, ptype=None, refang=True, unique=True):
        '''
        Deprecated entry point for calling ``$lib.scrape()`` directly.

        Args:
            text: The text to scrape.
            ptype: Optional form name passed through to the scraper as ``ptype``.
            refang: Passed through to the scraper as ``refang``.
            unique: If true, each scraped item is only yielded the first time it is seen.

        Yields:
            The items produced by ``s_scrape.scrapeAsync()``.
        '''
        text = await s_stormtypes.tostr(text)
        form = await s_stormtypes.tostr(ptype, noneok=True)
        refang = await s_stormtypes.tobool(refang)
        unique = await s_stormtypes.tobool(unique)

        # Remove this in 3.0.0 since it is deprecated.
        s_common.deprecated('Directly calling $lib.scrape()')
        await self.runt.warnonce('$lib.scrape() is deprecated. Use $lib.scrape.ndefs().')

        core = self.runt.snap.core

        # The spooled set tracks the items which have already been yielded so that
        # duplicates can be skipped when unique is set.
        async with await s_spooled.Set.anit(dirn=core.dirn, cell=core) as items:  # type: s_spooled.Set
            async for item in s_scrape.scrapeAsync(text, ptype=form, refang=refang, first=False):
                if unique:
                    if item in items:
                        continue
                    await items.add(item)

                yield item
                # Give other tasks a chance to run between yielded items.
                await asyncio.sleep(0)

    @s_stormtypes.stormfunc(readonly=True)
    async def _methContext(self, text):
        '''
        Yield (form, valu, info) tuples produced by the view's scrape interface for the text.
        '''
        text = await s_stormtypes.tostr(text)

        genr = self.runt.snap.view.scrapeIface(text)
        async for (form, valu, info) in genr:
            yield (form, valu, info)

    @s_stormtypes.stormfunc(readonly=True)
    async def _methNdefs(self, text):
        '''
        Yield (form, valu) tuples produced by the view's scrape interface for the text.

        The scrape interface is called with ``unique=True`` and the per-match info is discarded.
        '''
        text = await s_stormtypes.tostr(text)

        genr = self.runt.snap.view.scrapeIface(text, unique=True)
        async for (form, valu, _) in genr:
            yield (form, valu)

    @s_stormtypes.stormfunc(readonly=True)
    async def _methGenMatches(self, text, pattern, fangs=None, flags=regex.IGNORECASE):
        '''
        Yield (valu, info) tuples for each match of a regular expression in the text.

        Args:
            text: The text to scrape.
            pattern: The regular expression pattern; each match info is expected to carry a ``valu`` key.
            fangs: Optional (src, dst) pairs used to refang the text before matching.
            flags: Flags given to ``regex.compile()``.

        Yields:
            A (valu, info) tuple per match, where ``valu`` has been popped out of ``info``.
        '''
        text = await s_stormtypes.tostr(text)
        pattern = await s_stormtypes.tostr(pattern)
        fangs = await s_stormtypes.toprim(fangs)
        flags = await s_stormtypes.toint(flags)

        opts = {}
        regx = regex.compile(pattern, flags=flags)

        _fangs = None
        _fangre = None
        offsets = None
        scrape_text = text

        if fangs:
            _fangs = {src: dst for (src, dst) in fangs}
            _fangre = s_scrape.genFangRegex(_fangs)

            # Text below the spawn threshold is refanged inline; longer text is
            # handed to s_coro.semafork() instead of being processed directly here.
            if len(text) < s_scrape.SCRAPE_SPAWN_LENGTH:
                scrape_text, offsets = s_scrape.refang_text2(text, re=_fangre, fangs=_fangs)
            else:
                scrape_text, offsets = await s_coro.semafork(s_scrape.refang_text2, text, re=_fangre, fangs=_fangs)

        async for info in s_scrape.genMatchesAsync(scrape_text, regx, opts=opts):
            valu = info.pop('valu')

            # Matching ran against the refanged text, so the match info is rewritten
            # using the recorded offsets and the original text.
            if _fangs and offsets:
                s_scrape._rewriteRawValu(text, offsets, info)

            yield valu, info