Adversaries may exfiltrate data by transferring large volumes to the public internet, which could indicate data theft or espionage. SOC teams should proactively hunt for this behavior in Azure Sentinel to detect and mitigate potential data exfiltration campaigns early.
KQL Query
let starttime = 14d;
let endtime = 1d;
let timeframe = 1h;
let scorethreshold = 5;
let bytessentperhourthreshold = 10;
let TimeSeriesData = (union isfuzzy=true
(
VMConnection
| where TimeGenerated between (startofday(ago(starttime))..startofday(ago(endtime)))
| where isnotempty(DestinationIp) and isnotempty(SourceIp)
| extend SourceIP = SourceIp, DestinationIP = DestinationIp
| where ipv4_is_private(DestinationIP) == false
| extend DeviceVendor = "VMConnection"
| project TimeGenerated, BytesSent, DeviceVendor
| make-series TotalBytesSent=sum(BytesSent) on TimeGenerated from startofday(ago(starttime)) to startofday(ago(endtime)) step timeframe by DeviceVendor
),
(
CommonSecurityLog
| where TimeGenerated between (startofday(ago(starttime))..startofday(ago(endtime)))
| where isnotempty(DestinationIP) and isnotempty(SourceIP)
| where ipv4_is_private(DestinationIP) == false
| project TimeGenerated, SentBytes, DeviceVendor
| make-series TotalBytesSent=sum(SentBytes) on TimeGenerated from startofday(ago(starttime)) to startofday(ago(endtime)) step timeframe by DeviceVendor
)
);
//Filter anomolies against TimeSeriesData
let TimeSeriesAlerts = materialize(TimeSeriesData
| extend (anomalies, score, baseline) = series_decompose_anomalies(TotalBytesSent, scorethreshold, -1, 'linefit')
| mv-expand TotalBytesSent to typeof(double), TimeGenerated to typeof(datetime), anomalies to typeof(double),score to typeof(double), baseline to typeof(long)
| where anomalies > 0 | extend AnomalyHour = TimeGenerated
| extend TotalBytesSentinMBperHour = round(((TotalBytesSent / 1024)/1024),2), baselinebytessentperHour = round(((baseline / 1024)/1024),2), score = round(score,2)
| project DeviceVendor, AnomalyHour, TimeGenerated, TotalBytesSentinMBperHour, baselinebytessentperHour, anomalies, score);
let AnomalyHours = materialize(TimeSeriesAlerts | where TimeGenerated > ago(2d) | project TimeGenerated);
//Union of all BaseLogs aggregated per hour
let BaseLogs = (union isfuzzy=true
(
CommonSecurityLog
| where isnotempty(DestinationIP) and isnotempty(SourceIP)
| where TimeGenerated > ago(2d)
| extend DateHour = bin(TimeGenerated, 1h) // create a new column and round to hour
| where DateHour in ((AnomalyHours)) //filter the dataset to only selected anomaly hours
| where ipv4_is_private(DestinationIP) == false
| extend SentBytesinMB = ((SentBytes / 1024)/1024), ReceivedBytesinMB = ((ReceivedBytes / 1024)/1024)
| summarize HourlyCount = count(), TimeGeneratedMax=arg_max(TimeGenerated, *), DestinationIPList=make_set(DestinationIP, 100), DestinationPortList = make_set(DestinationPort,100), TotalSentBytesinMB = sum(SentBytesinMB), TotalReceivedBytesinMB = sum(ReceivedBytesinMB) by SourceIP, DeviceVendor, TimeGeneratedHour=bin(TimeGenerated,1h)
| where TotalSentBytesinMB > bytessentperhourthreshold
| sort by TimeGeneratedHour asc, TotalSentBytesinMB desc
| extend Rank=row_number(1, prev(TimeGeneratedHour) != TimeGeneratedHour) // Ranking the dataset per Hourly Partition
| where Rank < 10 // Selecting Top 10 records with Highest BytesSent in each Hour
| project DeviceVendor, TimeGeneratedHour, TimeGeneratedMax, SourceIP, DestinationIPList, DestinationPortList, TotalSentBytesinMB, TotalReceivedBytesinMB, Rank
),
(
VMConnection
| where isnotempty(DestinationIp) and isnotempty(SourceIp)
| where TimeGenerated > ago(2d)
| extend DateHour = bin(TimeGenerated, 1h) // create a new column and round to hour
| where DateHour in ((AnomalyHours)) //filter the dataset to only selected anomaly hours
| extend SourceIP = SourceIp, DestinationIP = DestinationIp
| where ipv4_is_private(DestinationIP) == false | extend DeviceVendor = "VMConnection"
| extend SentBytesinMB = ((BytesSent / 1024)/1024), ReceivedBytesinMB = ((BytesReceived / 1024)/1024)
| summarize HourlyCount = count(),TimeGeneratedMax=arg_max(TimeGenerated, *), DestinationIPList=make_set(DestinationIP, 100), DestinationPortList = make_set(DestinationPort, 100), TotalSentBytesinMB = sum(SentBytesinMB),TotalReceivedBytesinMB = sum(ReceivedBytesinMB) by SourceIP, DeviceVendor, TimeGeneratedHour=bin(TimeGenerated,1h)
| where TotalSentBytesinMB > bytessentperhourthreshold
| sort by TimeGeneratedHour asc, TotalSentBytesinMB desc
| extend Rank=row_number(1, prev(TimeGeneratedHour) != TimeGeneratedHour) // Ranking the dataset per Hourly Partition
| where Rank < 10 // Selecting Top 10 records with Highest BytesSent in each Hour
| project DeviceVendor, TimeGeneratedHour, TimeGeneratedMax, SourceIP, DestinationIPList, DestinationPortList, TotalSentBytesinMB, TotalReceivedBytesinMB, Rank
)
);
// Join against base logs to retrive records associated with the hour of anomoly
TimeSeriesAlerts
| where TimeGenerated > ago(2d)
| join (
BaseLogs | extend AnomalyHour = TimeGeneratedHour
) on DeviceVendor, AnomalyHour | sort by score desc
| project DeviceVendor, AnomalyHour,TimeGeneratedMax, SourceIP, DestinationIPList, DestinationPortList, TotalSentBytesinMB, TotalReceivedBytesinMB, TotalBytesSentinMBperHour, baselinebytessentperHour, score, anomalies
| summarize EventCount = count(), StartTimeUtc= min(TimeGeneratedMax), EndTimeUtc= max(TimeGeneratedMax), SourceIPMax= arg_max(SourceIP,*), TotalBytesSentinMB = sum(TotalSentBytesinMB), TotalBytesReceivedinMB = sum(TotalReceivedBytesinMB), SourceIPList = make_set(SourceIP, 100), DestinationIPList = make_set(DestinationIPList, 100) by AnomalyHour,TotalBytesSentinMBperHour, baselinebytessentperHour, score, anomalies
| project DeviceVendor, AnomalyHour, StartTimeUtc, EndTimeUtc, SourceIPMax, SourceIPList, DestinationIPList, DestinationPortList, TotalBytesSentinMB, TotalBytesReceivedinMB, TotalBytesSentinMBperHour, baselinebytessentperHour, score, anomalies, EventCount
id: f2dd4a3a-ebac-4994-9499-1a859938c947
name: Time series anomaly for data size transferred to public internet
description: |
'Identifies anomalous data transfer to public networks. The query leverages built-in KQL anomaly detection algorithms that detects large deviations from a baseline pattern.
A sudden increase in data transferred to unknown public networks is an indication of data exfiltration attempts and should be investigated.
The higher the score, the further it is from the baseline value.
The output is aggregated to provide summary view of unique source IP to destination IP address and port bytes sent traffic observed in the flagged anomaly hour.
The source IP addresses which were sending less than bytessentperhourthreshold have been exluded whose value can be adjusted as needed .
You may have to run queries for individual source IP addresses from SourceIPlist to determine if anything looks suspicious'
severity: Medium
requiredDataConnectors:
- connectorId: CiscoASA
dataTypes:
- CommonSecurityLog
- connectorId: PaloAltoNetworks
dataTypes:
- CommonSecurityLog
- connectorId: AzureMonitor(VMInsights)
dataTypes:
- VMConnection
queryFrequency: 1d
queryPeriod: 14d
triggerOperator: gt
triggerThreshold: 1
tactics:
- Exfiltration
relevantTechniques:
- T1030
tags:
- DEV-0537
query: |
let starttime = 14d;
let endtime = 1d;
let timeframe = 1h;
let scorethreshold = 5;
let bytessentperhourthreshold = 10;
let TimeSeriesData = (union isfuzzy=true
(
VMConnection
| where TimeGenerated between (startofday(ago(starttime))..startofday(ago(endtime)))
| where isnotempty(DestinationIp) and isnotempty(SourceIp)
| extend SourceIP = SourceIp, DestinationIP = DestinationIp
| where ipv4_is_private(DestinationIP) == false
| extend DeviceVendor = "VMConnection"
| project TimeGenerated, BytesSent, DeviceVendor
| make-series TotalBytesSent=sum(BytesSent) on TimeGenerated from startofday(ago(starttime)) to startofday(ago(endtime)) step timeframe by DeviceVendor
),
(
CommonSecurityLog
| where TimeGenerated between (startofday(ago(starttime))..startofday(ago(endtime)))
| where isnotempty(DestinationIP) and isnotempty(SourceIP)
| where ipv4_is_private(DestinationIP) == false
| project TimeGenerated, SentBytes, DeviceVendor
| make-series TotalBytesSent=sum(SentBytes) on TimeGenerated from startofday(ago(starttime)) to startofday(ago(endtime)) step timeframe by DeviceVendor
)
);
//Filter anomolies against TimeSeriesData
let TimeSeriesAlerts = materialize(TimeSeriesData
| extend (anomalies, score, baseline) = series_decompose_anomalies(TotalBytesSent, scorethreshold, -1, 'linefit')
| mv-expand TotalBytesSent to typeof(double), TimeGenerated to typeof(datetime), anomalies to typeof(double),score to typeof(double), baseline to typeof(long)
| where anomalies > 0 | extend AnomalyHour = TimeGenerated
| extend TotalBytesSentinMBperHour = round(((TotalBytesSent / 1024
| Sentinel Table | Notes |
|---|---|
CommonSecurityLog | Ensure this data connector is enabled |
VMConnection | Ensure this data connector is enabled |
Scenario: Scheduled Backup to Public Cloud Storage
Description: A legitimate backup job transfers large amounts of data to a public cloud storage service (e.g., AWS S3, Azure Blob Storage) during off-peak hours.
Filter/Exclusion: Exclude traffic to known public cloud storage endpoints (e.g., s3.amazonaws.com, blob.core.windows.net) during scheduled backup windows using a not filter on the destination domain.
Scenario: Log Aggregation to Public SIEM Service
Description: The enterprise uses a public SIEM service (e.g., Splunk Cloud, ELK Stack hosted on AWS) to aggregate logs from distributed systems.
Filter/Exclusion: Exclude traffic to the SIEM service’s public endpoints (e.g., splunkcloud.com, elasticsearch.amazonaws.com) using a destination_ip or destination_domain filter.
Scenario: Software Update Distribution to Remote Offices
Description: A centralized patch management system (e.g., Microsoft Endpoint Manager, SCCM) pushes updates to remote offices over the public internet.
Filter/Exclusion: Exclude traffic from known update servers (e.g., update.microsoft.com, download.windowsupdate.com) using a source_ip or source_domain filter.
Scenario: Data Transfer for Cloud-Based Collaboration Tools
Description: Employees use cloud-based collaboration tools (e.g., Microsoft Teams, Zoom, Google Workspace) which may transfer large data payloads over the public internet.
Filter/Exclusion: Exclude traffic to collaboration tool endpoints (e.g., teams.microsoft.com, meet.google.com) using a destination_domain filter.
Scenario: Temporary Data Transfer for DevOps Pipeline
Description: A CI/CD pipeline (e.g., Jenkins, GitLab CI) temporarily transfers large files (e.g., Docker images,