import asyncio
import logging
import regex
import synapse.common as s_common
import synapse.lib.coro as s_coro
import synapse.lib.scrape as s_scrape
import synapse.lib.spooled as s_spooled
import synapse.lib.stormtypes as s_stormtypes
logger = logging.getLogger(__name__)
class LibScrape(s_stormtypes.Lib):
    '''
    A Storm Library for providing helpers for scraping nodes from text.
    '''
    # Storm-facing API documentation for the functions exposed by this library.
    # Each entry's ``_funcname`` names the method on this class implementing it.
    _storm_locals = (
        {'name': 'context', 'desc': '''
            Attempt to scrape information from a blob of text, getting the context information about the values found.

            This does call the ``scrape`` Storm interface if that behavior is enabled on the Cortex.

            Scrape some text and make nodes out of it::

                for ($form, $valu, $info) in $lib.scrape.context($text) {
                    [ ( *$form ?= $valu ) ]
                }
            ''',
         'type': {'type': 'function', '_funcname': '_methContext',
                  'args': (
                      {'name': 'text', 'type': 'str',
                       'desc': 'The text to scrape', },
                  ),
                  'returns': {'name': 'yields', 'type': 'dict',
                              'desc': 'A dictionary of scraped values, rule types, and offsets scraped from the text.',
                              }}},
        {'name': 'ndefs', 'desc': '''
            Attempt to scrape node form, value tuples from a blob of text.

            Scrape some text and attempt to make nodes out of it::

                for ($form, $valu) in $lib.scrape.ndefs($text) {
                    [ ( *$form ?= $valu ) ]
                }
            ''',
         'type': {'type': 'function', '_funcname': '_methNdefs',
                  'args': (
                      {'name': 'text', 'type': 'str',
                       'desc': 'The text to scrape', },
                  ),
                  'returns': {'name': 'yields', 'type': 'list',
                              'desc': 'A list of (form, value) tuples scraped from the text.', }}},
        {'name': 'genMatches', 'desc': '''
            genMatches is a generic helper function for constructing scrape interfaces using pure Storm.

            It accepts the text, a regex pattern, and produce results that can easily be used to create

            The pattern must have a named regular expression match for the key ``valu`` using the
            named group syntax. For example ``(somekey\\s)(?P<valu>[a-z0-9]+)\\s``.

            A scrape implementation with a regex that matches name keys in text::

                function scrape(text, form) {
                    $ret = ()
                    for ($valu, $info) in $lib.scrape.genMatches($text, $re) {
                        $ret.append(($form, $valu, $info))
                    }
                    return ( $ret )
                }
            ''',
         'type': {'type': 'function', '_funcname': '_methGenMatches',
                  'args': (
                      {'name': 'text', 'type': 'str',
                       'desc': 'The text to scrape', },
                      {'name': 'pattern', 'type': 'str',
                       'desc': 'The regular expression pattern to match against.', },
                      {'name': 'fangs', 'type': 'list', 'default': None,
                       'desc': 'A list of (src, dst) pairs to refang from text. The src must be equal or larger '
                               'than the dst in length.'},
                      {'name': 'flags', 'type': 'int', 'default': regex.IGNORECASE,
                       'desc': 'Regex flags to use (defaults to IGNORECASE).'},
                  ),
                  'returns': {'name': 'yields', 'type': 'list',
                              'desc': ''}}}
    )
    _storm_lib_path = ('scrape', )

    def getObjLocals(self):
        '''
        Return the mapping of Storm-visible names to the bound methods implementing them.
        '''
        return {
            'ndefs': self._methNdefs,
            'context': self._methContext,
            'genMatches': self._methGenMatches,
        }

    async def __call__(self, text, ptype=None, refang=True, unique=True):
        '''
        Deprecated direct-call form of the library ( ``$lib.scrape()`` ).

        Args:
            text: The text to scrape; coerced to a str.
            ptype: Optional value forwarded to ``scrapeAsync`` as ``ptype``; None is allowed.
            refang: Coerced to a bool and forwarded to ``scrapeAsync``.
            unique: Coerced to a bool; when true each distinct item is yielded only once.

        Yields:
            The items produced by ``s_scrape.scrapeAsync``.
        '''
        text = await s_stormtypes.tostr(text)
        form = await s_stormtypes.tostr(ptype, noneok=True)
        refang = await s_stormtypes.tobool(refang)
        unique = await s_stormtypes.tobool(unique)

        # Remove this in 3.0.0 since it is deprecated.
        s_common.deprecated('Directly calling $lib.scrape()')
        await self.runt.warnonce('$lib.scrape() is deprecated. Use $lib.scrape.ndefs().')

        core = self.runt.snap.core
        async with await s_spooled.Set.anit(dirn=core.dirn, cell=core) as items:  # type: s_spooled.Set
            async for item in s_scrape.scrapeAsync(text, ptype=form, refang=refang, first=False):
                if unique:
                    # Skip anything already emitted; otherwise remember it so
                    # later duplicates are suppressed.
                    if item in items:
                        continue
                    await items.add(item)

                yield item
                # Hand control back to the event loop between items.
                await asyncio.sleep(0)

    async def _methContext(self, text):
        '''
        Yield (form, valu, info) tuples from the view's scrapeIface() for the given text.

        Args:
            text: The text to scrape; coerced to a str.
        '''
        text = await s_stormtypes.tostr(text)
        genr = self.runt.snap.view.scrapeIface(text)
        async for (form, valu, info) in genr:
            yield (form, valu, info)

    async def _methNdefs(self, text):
        '''
        Yield (form, valu) tuples from the view's scrapeIface() for the given text.

        scrapeIface() is called with ``unique=True`` and the third element of
        each result is discarded.

        Args:
            text: The text to scrape; coerced to a str.
        '''
        text = await s_stormtypes.tostr(text)
        genr = self.runt.snap.view.scrapeIface(text, unique=True)
        async for (form, valu, _) in genr:
            yield (form, valu)

    async def _methGenMatches(self, text, pattern, fangs=None, flags=regex.IGNORECASE):
        '''
        Yield (valu, info) tuples for each match of a regular expression in the text.

        Args:
            text: The text to scrape; coerced to a str.
            pattern: The regular expression pattern; coerced to a str and compiled with ``regex``.
            fangs: Optional iterable of (src, dst) pairs; when given, the text is refanged
                   with them before matching.
            flags: Regex flags, coerced to an int (defaults to ``regex.IGNORECASE``).

        Yields:
            (valu, info) where ``valu`` is popped from the ``valu`` key of each match info dict.
        '''
        text = await s_stormtypes.tostr(text)
        pattern = await s_stormtypes.tostr(pattern)
        fangs = await s_stormtypes.toprim(fangs)
        flags = await s_stormtypes.toint(flags)

        opts = {}
        regx = regex.compile(pattern, flags=flags)

        _fangs = None
        _fangre = None
        offsets = None
        scrape_text = text
        if fangs:
            _fangs = {src: dst for (src, dst) in fangs}
            _fangre = s_scrape.genFangRegex(_fangs)
            scrape_text, offsets = await s_coro.semafork(s_scrape.refang_text2, text, re=_fangre, fangs=_fangs)

        async for info in s_scrape.genMatchesAsync(scrape_text, regx, opts=opts):
            valu = info.pop('valu')
            if _fangs and offsets:
                # Matching ran against the refanged text; let the helper adjust
                # the info using the original text and the refang offsets.
                # NOTE(review): presumably this restores the raw (defanged) value
                # and offsets - confirm against s_scrape._rewriteRawValu.
                s_scrape._rewriteRawValu(text, offsets, info)

            yield valu, info