{
 "cells": [
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Static imports\n",
    "import json\n",
    "import os\n",
    "from urllib.parse import urlparse\n"
   ],
   "id": "34df6c94-68fa-432d-a7cb-104b0639d708",
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "9b6356a0-23b1-431c-a7c9-4ebcdd7cecb3",
   "metadata": {},
   "source": [
    "# Input definition\n",
    "sets_to_analyse = [\"/tmp/scanning-results-partial-leaks/\"]\n",
    "sets_to_compare = []\n",
    "\n",
    "data = {}\n",
    "\n",
    "for set_to_analyse in sets_to_analyse:\n",
    "    data[set_to_analyse] = {}"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "36678fe5-cf8e-4c44-bd18-6b420555fd5b",
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "source": [
    "\n",
    "import time\n",
    "from IPython.core.display import Markdown\n",
    "# Analyse files\n",
    "\n",
    "from ipywidgets import IntProgress\n",
    "\n",
    "\n",
    "def extract_lreqs():\n",
    "    analysis_result = json_data['analysis_result']\n",
    "    if \"found_sso_requests\" in analysis_result:\n",
    "        for found_sso_request in analysis_result['found_sso_requests']:\n",
    "            # Falsely classified generic wechat requests\n",
    "            if found_sso_request['idp'] == \"GENERIC\" and (found_sso_request['lreq'].startswith(\"https://open.weixin.qq.com/connect/oauth2/authorize\") or found_sso_request['lreq'].startswith(\"https://open.work.weixin.qq.com/wwopen/sso/qrConnect\")):\n",
    "                found_sso_request['idp'] = \"WECHAT\"\n",
    "            found_sso_request[\"page\"] = analysis_result['page']\n",
    "            found_sso_request[\"tranco_id\"] = json_data['analysis_config']['tranco_id']\n",
    "            data[set_to_analyse]['sso_requests'].append(found_sso_request)\n",
    "\n",
    "\n",
    "def extract_errors():\n",
    "    if \"thrown_exception\" not in json_data['analysis_result']:\n",
    "        data[set_to_analyse]['errors'].append(\n",
    "            {\"page\": json_data['analysis_config']['page'], \"exception\": json_data['analysis_result']['exception']})\n",
    "    elif json_data['analysis_result']['thrown_exception'] is not None:\n",
    "        data[set_to_analyse]['errors'].append({\"page\": json_data['analysis_config']['page'],\n",
    "                                               \"exception\": json_data['analysis_result']['thrown_exception']})\n",
    "\n",
    "\n",
    "def extract_timings():\n",
    "    if 'duration' in json_data['analysis_result']:\n",
    "        data[set_to_analyse]['timings']['all_durs'].append(json_data['analysis_result']['duration'])\n",
    "        if json_data['analysis_result'][\"thrown_exception\"] is None:\n",
    "            data[set_to_analyse]['timings']['all_durs_wo_err'].append(json_data['analysis_result']['duration'])\n",
    "\n",
    "\n",
    "def find_json_files(directory):\n",
    "    json_files = []\n",
    "    for root, dirs, files in os.walk(directory):\n",
    "        for file in files:\n",
    "            if file.endswith('.json'):\n",
    "                json_files.append(os.path.join(root, file))\n",
    "    return json_files\n",
    "\n",
    "\n",
    "for set_to_analyse in sets_to_analyse:\n",
    "    start_time = time.time()\n",
    "    files_to_analyse = find_json_files(set_to_analyse)\n",
    "    data[set_to_analyse]['sso_requests'] = []\n",
    "    data[set_to_analyse]['errors'] = []\n",
    "    data[set_to_analyse]['timings'] = {\"all_durs\": [], \"all_durs_wo_err\": []}\n",
    "    files_length = len(files_to_analyse)\n",
    "    data[set_to_analyse]['count_of_files'] = files_length\n",
    "    progress = IntProgress(min=0, max=files_length)  # instantiate the bar\n",
    "    display(Markdown(f\"## Analysing %s\" % set_to_analyse))\n",
    "    display(progress)\n",
    "    count = 0\n",
    "    for f in files_to_analyse:\n",
    "        count += 1\n",
    "        if count % 100 == 0:\n",
    "            progress.value = count\n",
    "        with open(f) as json_file:\n",
    "            json_data = json.load(json_file)\n",
    "            extract_lreqs()\n",
    "            extract_errors()\n",
    "            extract_timings()\n",
    "\n",
    "    print(f\"Finished analysis for %s in %s\" % (set_to_analyse, (time.time() - start_time)))\n"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "64c196d5-212f-4812-8f93-34de5892c59e",
   "metadata": {},
   "source": [
    "from statistics import mean\n",
    "from tldextract import tldextract\n",
    "\n",
    "\n",
    "# Results\n",
    "def get_stats_of_idps(reqs: []):\n",
    "    stats = {}\n",
    "    for req in reqs:\n",
    "        if req['idp'] not in stats:\n",
    "            stats[req['idp']] = {\"count\": 0, \"only_after_interaction\": 0}\n",
    "        stats[req['idp']][\"count\"] += 1\n",
    "        stats[req['idp']][\"only_after_interaction\"] += 1 if req['user_action_performed'] else 0\n",
    "        if req['idp'] == \"GENERIC\":\n",
    "            if \"details\" not in stats['GENERIC']:\n",
    "                stats[\"GENERIC\"]['details'] = {}\n",
    "            tld_info = tldextract.extract(urlparse(req[\"lreq\"]).netloc)\n",
    "            if tld_info.registered_domain not in stats['GENERIC']['details']:\n",
    "                stats['GENERIC']['details'][tld_info.registered_domain] = {\"count\": 0, \"sub_domain_counts\": {}}\n",
    "            stats['GENERIC']['details'][tld_info.registered_domain][\"count\"] += 1\n",
    "            if tld_info.fqdn not in stats['GENERIC']['details'][tld_info.registered_domain][\"sub_domain_counts\"]:\n",
    "                stats['GENERIC']['details'][tld_info.registered_domain][\"sub_domain_counts\"][tld_info.fqdn] = 0\n",
    "            stats['GENERIC']['details'][tld_info.registered_domain][\"sub_domain_counts\"][tld_info.fqdn] += 1\n",
    "    return stats\n",
    "\n",
    "\n",
    "def group_errors(errors: []):\n",
    "    groups = [\n",
    "        {\"name\": \"Timeout\", \"pattern\": \"Timeout 30000ms exceeded\", \"errors\": []},\n",
    "        {\"name\": \"Net errors\", \"pattern\": \"net::\", \"errors\": []},\n",
    "        {\"name\": \"Unknown errors\", \"pattern\": \"\", \"errors\": []}\n",
    "    ]\n",
    "    for error in errors:\n",
    "        added = False\n",
    "        for group in groups:\n",
    "            if not added and group['pattern'] in error['exception']:\n",
    "                group['errors'].append(error)\n",
    "                added = True\n",
    "        if not added:\n",
    "            print(\"Should not happen!\")\n",
    "    return groups\n",
    "\n",
    "\n",
    "def group_timings(timings):\n",
    "    return (min(timings), max(timings), mean(timings),\n",
    "            len([dur for dur in timings if dur < 10]),\n",
    "            len([dur for dur in timings if 10 <= dur < 20]),\n",
    "            len([dur for dur in timings if 20 <= dur < 30]),\n",
    "            len([dur for dur in timings if 30 <= dur < 40]),\n",
    "            len([dur for dur in timings if 40 <= dur < 50]),\n",
    "            len([dur for dur in timings if 50 <= dur < 60]),\n",
    "            len([dur for dur in timings if 60 <= dur < 90]),\n",
    "            len([dur for dur in timings if 90 <= dur < 120]),\n",
    "            len([dur for dur in timings if 120 <= dur < 180]),\n",
    "            len([dur for dur in timings if 180 <= dur < 240]),\n",
    "            len([dur for dur in timings if 240 <= dur < 300]),\n",
    "            len([dur for dur in timings if 300 <= dur]),\n",
    "            )\n",
    "\n",
    "\n",
    "def format_timings(grouped_timings):\n",
    "    return f\"\\n- Minimum time: %s seconds\\n- Maximum time: %s seconds\\n- Average time: %s seconds\\n- Count <10: %s\\n- Count <20: %s\\n- Count <30: %s\\n- Count <40: %s\\n- Count <50: %s\\n- Count <60: %s\\n- Count <90: %s\\n- Count <120: %s\\n- Count <180: %s\\n- Count <240: %s\\n- Count <300: %s\\n- Count >=300: %s\" % grouped_timings\n",
    "\n",
    "\n",
    "print(\"# Results\")\n",
    "for set_to_analyse in sets_to_analyse:\n",
    "    set_instance = data[set_to_analyse]\n",
    "    print(\"## %s\" % set_to_analyse)\n",
    "    print(\"\\n**Timings**\")\n",
    "    #print(format_timings(group_timings(set_instance['timings']['all_durs'])))\n",
    "    print(\"\\n**Timings /wo Errors**\")\n",
    "    #print(format_timings(group_timings(set_instance['timings']['all_durs_wo_err'])))\n",
    "    unique_websites_with_pleaks = {}\n",
    "    for request in set_instance['sso_requests']:\n",
    "        if request['page'] not in unique_websites_with_pleaks:\n",
    "            unique_websites_with_pleaks[request['page']] = []\n",
    "        unique_websites_with_pleaks[request['page']].append(request['idp'])\n",
    "    print(\n",
    "        f\"\\n**Generic Infos**\\n\\n- Total files analysed: %s\\n- Found login requests: %s\\n- Unique pages: %s\\n- Total errors %s\" % (\n",
    "            set_instance['count_of_files'], len(set_instance['sso_requests']), len(unique_websites_with_pleaks),\n",
    "            len(set_instance['errors'])))\n",
    "    print(\"\\n**Multiple Leak Statistic**\")\n",
    "    p_with_m_leaks = [unique_websites_with_pleaks[ml] for ml in unique_websites_with_pleaks if\n",
    "                      len(unique_websites_with_pleaks[ml]) > 1]\n",
    "    sorted_mleaks = {}\n",
    "    for ml in p_with_m_leaks:\n",
    "        combination = str(sorted(ml))\n",
    "        if combination not in sorted_mleaks:\n",
    "            sorted_mleaks[combination] = 0\n",
    "        sorted_mleaks[combination] += 1\n",
    "    print(str(sorted_mleaks))\n",
    "    print(str())\n",
    "    print(\"\\n**IdP Statistics**\")\n",
    "    idp_stats = get_stats_of_idps(data[set_to_analyse]['sso_requests'])\n",
    "    md = \"\"\n",
    "    for idp in idp_stats:\n",
    "        md += f\"- %s: %s (%s after interaction)\\n\" % (\n",
    "            idp.title(), idp_stats[idp]['count'], idp_stats[idp]['only_after_interaction'])\n",
    "        if idp == \"FACEBOOK\":\n",
    "            xoauth_status = [req for req in data[set_to_analyse]['sso_requests'] if\n",
    "                             req['idp'] == idp and \"x/oauth/status\" in req['lreq'].lower()]\n",
    "            not_xoauth_status = [req for req in data[set_to_analyse]['sso_requests'] if\n",
    "                                 req['idp'] == idp and \"x/oauth/status\" not in req['lreq'].lower()]\n",
    "            md += f\"  - /x/oauth/status : %s\\n  - other: %s\\n\" % (len(xoauth_status), len(not_xoauth_status))\n",
    "            for s in not_xoauth_status:\n",
    "                md += f\"    - `%s`\\n\" % s[\"page\"].replace(\"https://\", \"\")\n",
    "        elif idp == \"GOOGLE\":\n",
    "            gsi_status = [req for req in data[set_to_analyse]['sso_requests'] if\n",
    "                          req['idp'] == idp and \"/gsi/status\" in req['lreq'].lower()]\n",
    "            gsi_iframe_select_status = [req for req in data[set_to_analyse]['sso_requests'] if\n",
    "                                        req['idp'] == idp and \"/gsi/iframe/select\" in req['lreq'].lower()]\n",
    "            other_status = [req for req in data[set_to_analyse]['sso_requests'] if\n",
    "                            req['idp'] == idp and \"/gsi/status\" not in req[\n",
    "                                'lreq'].lower() and \"/gsi/iframe/select\" not in req['lreq'].lower()]\n",
    "            md += f\"  - /gsi/status: %s\\n  - /gsi/iframe/select: %s\\n  - other: %s\\n\" % (\n",
    "                len(gsi_status), len(gsi_iframe_select_status), len(other_status))\n",
    "            for s in other_status:\n",
    "                md += f\"    - `%s`\\n\" % s[\"page\"].replace(\"https://\", \"\")\n",
    "        elif idp == \"GENERIC\":\n",
    "            idps_with_more_than_one_sp = [g_idp for g_idp in idp_stats[idp]['details'] if idp_stats[idp]['details'][g_idp]['count'] > 1]\n",
    "            md += \"  - Total generic IdPs: %s\\n\" % len(idp_stats[idp]['details'])\n",
    "            md += \"  - IdPs with more than one SP: %s\\n\" % len(idps_with_more_than_one_sp)\n",
    "        elif idp != \"GENERIC\":\n",
    "            login_types = {}\n",
    "            reqs = [req for req in data[set_to_analyse]['sso_requests'] if req['idp'] == idp]\n",
    "            for req in reqs:\n",
    "                login_type = urlparse(req['lreq']).netloc + urlparse(req['lreq']).path\n",
    "                if login_type not in login_types:\n",
    "                    login_types[login_type] = 0\n",
    "                login_types[login_type] += 1\n",
    "            others = 0\n",
    "            for t in login_types:\n",
    "                if login_types[t] > 1:\n",
    "                    md += \"  - %s: %s\\n\" % (t, login_types[t])\n",
    "                else:\n",
    "                    others += 1\n",
    "            if others > 0:\n",
    "                md += \"  - %s: %s\\n\" % (\"Others\", others)\n",
    "    print(md)\n",
    "    if \"GENERIC\" in idp_stats:\n",
    "        print(\"**Generic Providers**\")\n",
    "        print(\"```\")\n",
    "        limit = 10\n",
    "        for index, info in enumerate(sorted(idp_stats[\"GENERIC\"][\"details\"], key=lambda key: idp_stats[\"GENERIC\"][\"details\"][key]['count'],\n",
    "                           reverse=True)):\n",
    "            if index > limit:\n",
    "                break\n",
    "            print(f\"%s: %s\" % (info, idp_stats[\"GENERIC\"][\"details\"][info][\"count\"]))\n",
    "            for sub in idp_stats[\"GENERIC\"][\"details\"][info]['sub_domain_counts']:\n",
    "                if sub != info:\n",
    "                    print(f\"\\t- %s: %s\" % (sub, idp_stats[\"GENERIC\"][\"details\"][info]['sub_domain_counts'][sub]))\n",
    "        print(\"```\")\n",
    "    print(\"**Errors:**\")\n",
    "    markdown = \"\"\n",
    "    unknown_errors = None\n",
    "    for group in group_errors(data[set_to_analyse][\"errors\"]):\n",
    "        if group[\"name\"] == 'Unknown errors':\n",
    "            unknown_errors = group[\"errors\"]\n",
    "        markdown += f\"\\n* %s: %s\" % (group['name'], len(group['errors']))\n",
    "        if group[\"name\"] == \"Net errors\":\n",
    "            net_groups = {}\n",
    "            for err in group[\"errors\"]:\n",
    "                err_name = err[\"exception\"].split(\" \")[0]\n",
    "                if err_name not in net_groups:\n",
    "                    net_groups[err_name] = 0\n",
    "                net_groups[err_name] += 1\n",
    "            for net_group in sorted(net_groups, key=net_groups.get, reverse=True):\n",
    "                markdown += f\"\\n  * %s: %s\" % (net_group, net_groups[net_group])\n",
    "\n",
    "    print(markdown)\n",
    "    print(\"```\")\n",
    "    err = [str(e) + \"\\n\" for e in unknown_errors]\n",
    "    print(\"\".join(err))\n",
    "    print(\"```\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "4be8772ef7605f0b",
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "source": [
    "import sys\n",
    "from math import floor\n",
    "import pandas\n",
    "import matplotlib  # MUST BE IMPORTED!\n",
    "\n",
    "\n",
    "def create_line_chard_found_sso_req(steps, analysis_range=(1, 1000000), overwrite_xticks=None):\n",
    "    if (analysis_range[0] - 1) % steps != 0 or analysis_range[1] % steps != 0:\n",
    "        raise Exception(\n",
    "            \"Analysis range bounds must be divisible by stepsize. Otherwise, the diagram will be distorted!\")\n",
    "    groups = []\n",
    "    group_count = floor(analysis_range[1] / steps) + (0 if analysis_range[1] % steps == 0 else 1)\n",
    "    for i in range(0, group_count):\n",
    "        if analysis_range[0] < i * steps >= analysis_range[1]:\n",
    "            continue\n",
    "        groups.append(0)\n",
    "    for dataset in data[set_to_analyse][\"sso_requests\"]:\n",
    "        t_id = dataset[\"tranco_id\"]\n",
    "        if analysis_range[0] < t_id > analysis_range[1]:\n",
    "            continue\n",
    "        group = floor(dataset[\"tranco_id\"] / steps)\n",
    "        groups[group] += 1\n",
    "    stepnames = [(f\"%sK\" if (c + 1) * steps < 1000000 else f\"%sM\") % int(\n",
    "        (c + 1) * steps / (1000 if (c + 1) * steps < 1000000 else 1000000)) for c in range(len(groups))]\n",
    "    df = pandas.DataFrame({'Found login requests': groups}, index=stepnames)\n",
    "    lower_string = str(analysis_range[0]) if analysis_range[0] < 1000 else f\"%sK\" % int(analysis_range[0] / 1000) if \\\n",
    "        analysis_range[0] < 1000000 else f\"%sM\" % int(analysis_range[0] / 1000000)\n",
    "    upper_string = str(analysis_range[1]) if analysis_range[1] < 1000 else f\"%sK\" % int(analysis_range[1] / 1000) if \\\n",
    "        analysis_range[1] < 1000000 else f\"%sM\" % int(analysis_range[1] / 1000000)\n",
    "    df.plot(title=f\"%sK Steps - Range: %s - %s\" % (int(steps / 1000), lower_string, upper_string),\n",
    "            xticks=overwrite_xticks)\n",
    "    print(groups)\n",
    "\n",
    "\n",
    "# Verläufe \n",
    "display(Markdown(\"# Verläufe\"))\n",
    "for set_to_analyse in sets_to_analyse:\n",
    "    create_line_chard_found_sso_req(100000, overwrite_xticks=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])\n",
    "    #create_line_chard_found_sso_req(10000, overwrite_xticks=[0,9,19,29,39,49,59,69,79,89,99])\n",
    "    create_line_chard_found_sso_req(10000, analysis_range=(1, 100000))\n"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "8d667b42f35eddf",
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "source": [
    "# Export Pages with hidden lreqs\n",
    "display(Markdown(\"# Pages with Requests\"))\n",
    "for set_to_analyse in sets_to_analyse:\n",
    "    display(Markdown(f\"## %s\" % set_to_analyse))\n",
    "    for result in data[set_to_analyse][\"sso_requests\"]:\n",
    "        print(f\"%s,%s\" % (result[\"page\"], result[\"idp\"]))"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "a9b7faa38e761f6b",
   "metadata": {},
   "source": [
    "from pathlib import Path\n",
    "import csv\n",
    "\n",
    "export_csv = True\n",
    "if export_csv:\n",
    "    output_file = \"/tmp/exported_lreq{}.csv\"\n",
    "    for set_to_analyse in sets_to_analyse:\n",
    "        if len(sets_to_analyse) > 1:\n",
    "            file_name = output_file.replace(\"{}\", set_to_analyse.replace(\"./\", \"\").replace(\"/\", \"-\"))\n",
    "        else:\n",
    "            file_name = \"/tmp/exported_lreq.csv\"\n",
    "        with open(file_name, \"w\", newline='') as f:\n",
    "            csv_writer = csv.writer(f)\n",
    "            for result in data[set_to_analyse][\"sso_requests\"]:\n",
    "                csv_writer.writerow([result[\"page\"], result[\"idp\"], result[\"lreq\"]])\n",
    "            Path(file_name).parent.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "        print(f\"Finished exporting %s\" % file_name)\n"
   ],
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}