Apache Spark Distributed Application, using PySpark in Google Colab.
# %% [markdown]
# ### Week 11 Class Exercise :: Apache Spark Practice

# %% [markdown]
# ##### **Install PySpark (Python Spark API)**

# %%
#!pip install pyspark

# %% [markdown]
# #### **Initiate and configure Spark Context**

# %%
from pyspark.conf import SparkConf
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Assemble the settings on a standalone SparkConf. Starting a throwaway
# SparkContext only to borrow its conf (and stopping it again) costs several
# seconds and fails with "Cannot run multiple SparkContexts at once" if the
# cell is re-run while a context is still active.
config = SparkConf().setAll([
    ('spark.cores.max', '4'),
    ('spark.executor.memory', '8G'),
    ('spark.driver.maxResultSize', '8g'),
    ('spark.kryoserializer.buffer.max', '512m'),
    ("spark.driver.cores", "4"),
])

# %%
# getOrCreate returns the live context when one already exists, so this cell
# can be executed more than once per kernel.
sc = SparkContext.getOrCreate(conf=config)
sqlContext = SQLContext(sc)
print("Using Apache Spark Version", sc.version)

# %% [markdown]
# #### **Read a large CSV file into Spark DataFrame**

# %%
cb_file = "crunchbase_odm_orgs.csv"

# %%
# Load the file once. With schema inference every read scans the whole CSV,
# so reading it twice and discarding the first result only wastes time.
# NOTE(review): treatEmptyValuesAsNulls looks like an option of the legacy
# spark-csv package -- confirm the built-in reader honours it.
cb_sdf = (
    sqlContext.read.format("csv")
    .options(header='true', inferschema='true', treatEmptyValuesAsNulls='true')
    .load(cb_file)
)
cb_sdf.count()

# %% [markdown]
# #### **Select columns into RDD, format them into str, and back to dataframe**

# %%
cb_sdf.columns
# %%
import pyspark.sql.functions as F
from pyspark.sql.functions import initcap

KEY_COLUMNS = ['uuid', 'name', 'domain']

# Pull the identifying columns through an RDD, stringify every value, and
# rebuild a DataFrame from the result.
# NOTE(review): str() maps a null to the literal text 'None'; confirm that is
# acceptable, since such rows would later group together as one "name".
cb_rdd = cb_sdf.rdd.map(lambda row: [str(row[field]) for field in KEY_COLUMNS])
cb_sdf2 = sqlContext.createDataFrame(cb_rdd, KEY_COLUMNS)
cb_sdf2.show(10)

# %% [markdown]
# #### **Select a range of rows between preset indexes**

# %%
# zipWithIndex numbers the rows 0, 1, 2, ... in their existing order.
# monotonically_increasing_id() is not suitable for positional selection: its
# ids are unique and increasing but NOT consecutive (the partition id is
# packed into the upper bits), so a range filter on them silently misses
# every row outside the first partition.
indexed_rdd = cb_sdf2.rdd.zipWithIndex().map(lambda pair: tuple(pair[0]) + (pair[1],))
cb_sdf3 = sqlContext.createDataFrame(indexed_rdd, cb_sdf2.columns + ['index'])

# %%
start_row = 3
end_row = 7
# Both bounds are exclusive: rows start_row + 1 .. end_row - 1 are shown.
cb_sdf3.where((F.col('index') > start_row) & (F.col('index') < end_row)).show()

# %% [markdown]
# #### **Apply transformations to name field**

# %%
# NOTE: despite its name, 'name_lower' holds the initcap() form of the name
# (first letter of each word upper-cased, the rest lower-cased). The column
# name is kept because other cells refer to it.
cb_sdf3 = cb_sdf3.withColumn('name_lower', initcap('name'))
cb_sdf3.select('name', 'name_lower').show()
# %%
def upper_case(entity_name):
    """Return ``entity_name`` converted to upper case.

    Args:
        entity_name: the string to convert, or ``None``.

    Returns:
        The upper-cased string, or ``None`` when ``entity_name`` is ``None``.
        Spark passes ``None`` to a Python UDF for every null cell; without
        this guard a single null would abort the whole job with an
        ``AttributeError``.
    """
    if entity_name is None:
        return None
    return entity_name.upper()

# %%
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

# Wrap the plain Python function as a Spark UDF that yields strings.
spark_udf = udf(upper_case, StringType())
cb_sdf3 = cb_sdf3.withColumn('name_upper', spark_udf('name'))
cb_sdf3.select('name', 'name_lower', 'name_upper').show(20, truncate=False)
# %% [markdown]
# #### **Find duplicates by name**

# %%
# Explicit module alias instead of `from pyspark.sql.functions import *`,
# which silently replaces Python builtins such as sum/min/max/abs/round with
# Spark column functions for the rest of the notebook.
from pyspark.sql import functions as F


def count_col_dups(df, col_name):
    """Return the values of ``col_name`` that occur more than once in ``df``.

    Args:
        df: Spark DataFrame to inspect.
        col_name: name of the column whose repeated values are wanted.

    Returns:
        A DataFrame with the columns ``col_name`` and ``count``, one row per
        value that appears at least twice.
    """
    # F.count(<column>) counts non-null values only, so the group of null
    # keys gets a count of 0 and is dropped by the filter below.
    occurrences = df.groupBy(col_name).agg(F.count(col_name).alias("count"))
    return occurrences.filter(F.col('count') > 1)

# %%
dupes = count_col_dups(cb_sdf3, 'name_lower')
dupes.show(20, truncate=False)
dupes.count()
# %% [markdown]
# ##### Extract a portion of the dataframe that contains duplicates

# %%
import pyspark.sql.functions as F
from pyspark.sql import Window

# Attach to every row the number of rows sharing its normalized name, then
# keep only the rows whose name occurs more than once.
name_window = Window.partitionBy('name_lower')
dups_df = (
    cb_sdf3
    .select('*', F.count('name_lower').over(name_window).alias('dupeCount'))
    .where('dupeCount > 1')
)
dups_df.show()

# %% [markdown]
# ##### Write to a CSV file

# %%
# Spark creates a *directory* named 'cb_dupes.csv'; coalesce(1) makes it hold
# a single part file. The default save mode raises an error when the path
# already exists, so "overwrite" is required for the cell to be re-runnable.
(
    dups_df.coalesce(1)
    .write.mode("overwrite")
    .format("csv")
    .option("header", "true")
    .save('cb_dupes.csv')
)
# %% [markdown]
# ### **Using Spark MLLib to train Word2vec model on descriptions**

# %%
from pyspark.sql.functions import col
cb_sdf.select(col("uuid"), col("name"), col("short_description")).show()

# %%
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec

# gaps=False makes the pattern describe the tokens themselves rather than the
# separators between them. The pattern is a raw string: '\w' in a normal
# string literal is an invalid escape sequence that Python warns about.
regexTokFilter = RegexTokenizer(gaps=False, pattern=r'\w+', inputCol='short_description', outputCol='tokens')
stopwordFilter = StopWordsRemover(inputCol='tokens', outputCol='tokens_sw_removed')

# %%
SUBSET_ROWS = 30000  # number of organisations used to train the model

cb_sdf_tok = regexTokFilter.transform(cb_sdf)
cb_sdf_swr = stopwordFilter.transform(cb_sdf_tok)
# limit() without an ordering does not promise the same rows on every
# evaluation, and the subset is evaluated by several actions below (show,
# fit, transform). Caching pins one materialization and avoids re-tokenizing.
cb_sdf_subset = cb_sdf_swr.limit(SUBSET_ROWS).cache()
cb_sdf_subset.select('uuid', 'name', 'short_description', 'tokens', 'tokens_sw_removed').show()

# %%
word2vec = Word2Vec(vectorSize=300, minCount=5, inputCol='tokens_sw_removed', outputCol='wordvectors')
model = word2vec.fit(cb_sdf_subset)
wordvectors = model.transform(cb_sdf_subset)
# select() already returns a DataFrame; a further .rdd.toDF() round trip
# would only serialize every 300-d vector through Python and back.
cb_sdf_w2v = wordvectors.select('uuid', 'name', 'short_description', 'wordvectors')
cb_sdf_w2v.show()

# %%
# The 20 vocabulary words whose vectors are closest to "facebook".
synonyms = model.findSynonyms("facebook", 20)
# %%
synonyms.show()

# %%
# Bring the document vectors to the driver as a list of Rows.
cb_sdf_w2v_final = cb_sdf_w2v.collect()

# %% [markdown]
# #### **Retrieve companies with descriptions similar (cosine) to input query**

# %%
import numpy as np


def cossim(v1, v2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        v1: first vector (anything ``np.dot`` accepts).
        v2: second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero length.
    """
    norm_product = np.sqrt(np.dot(v1, v1)) * np.sqrt(np.dot(v2, v2))
    # An all-zero vector has no direction. Report "no similarity" explicitly
    # rather than padding one norm with a constant, which made identical
    # vectors score below 1 and tied the score to the magnitude of v2.
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product

# %%
query_txt = "social media"
query_df = sc.parallelize([(1, query_txt)]).toDF(['index', 'short_description'])
# Push the query through the same tokenizer, stop-word filter and Word2Vec
# model as the descriptions so both live in the same vector space.
query_tok = regexTokFilter.transform(query_df)
query_swr = stopwordFilter.transform(query_tok)
query_w2v = model.transform(query_swr)
query_vec = query_w2v.select('wordvectors').first()[0]

# %%
# Score every collected row (uuid, name, short_description, wordvectors)
# against the query vector, then rank by similarity, best match first.
scored_rows = [
    (row[0], row[1], row[2], float(cossim(query_vec, row[3])))
    for row in cb_sdf_w2v_final
]
sim_rdd = sc.parallelize(scored_rows)
sim_df = (
    sqlContext.createDataFrame(sim_rdd, ['uuid', 'name', 'description', 'similarity'])
    .orderBy("similarity", ascending=False)
)
sim_df.show()
media|0.9312872385408142|\n","|73bd4eb0-2181-1af...|Starfish Communit...| Social Media|0.9312872385408142|\n","|de0d0c0d-5aa8-64d...| Sale-Talk| social media|0.9312872385408142|\n","|0c721a0f-2133-4e7...| Crock| Social Media|0.9312872385408142|\n","|ccecc745-d321-de1...| Rallypoint|Social Media thro...|0.9063814697048753|\n","|c59cdfb5-a47e-be2...| politicsultra|Political Social ...|0.9028432822941245|\n","|70a5fb4c-8ff1-4e2...| SocialTrak.com|Social networking...|0.9019569968955012|\n","|719de690-254d-977...| e-Promo Ltd.|Social Media Mark...|0.9003543226406379|\n","|20d751cb-5ff0-223...|The Restaurant Se...|Restaurant social...|0.8943987115246731|\n","|87b36999-9bf6-af3...| NameDrop|Celebrity Social ...|0.8923769452217916|\n","|e943e425-b961-05c...| UGENmedia|Social Media Adve...| 0.884695469385349|\n","|642941aa-3393-835...|Digital Sports Ve...|Social Media for ...|0.8785493779248205|\n","|ec9fe537-f004-970...| Jockipedia| Sports Social Media|0.8785493779248205|\n","|15665a59-1c58-d9f...| Jimboland Jots|Social media comm...|0.8754932321199702|\n","|5ed5b6ed-97e4-49f...|Market and Sales ...|Advertising, Publ...|0.8751196739834982|\n","|0db106dd-d4be-4ad...| Slush Pile Reader|Literary social m...| 0.874495826654017|\n","|dd931fea-a54a-622...|Social Media Law ...| Social Media in Law| 0.865918139726867|\n","|78c9d29c-8349-f72...| Pass It On| Social Media Agency|0.8607133903766967|\n","|85a93527-ae4c-631...| iBuddz|Social Media & Ev...|0.8569506115322025|\n","|358c43d4-8889-14f...| IQPC|social media mark...|0.8551789084426622|\n","+--------------------+--------------------+--------------------+------------------+\n","only showing top 20 rows\n","\n"],"name":"stdout"}]}]}