Nice that now python has this built-in method for creating virtual environments per docs # like this python3 -m venv /path/to/new/virtual/environment python -m venv ~/.python_venvs/skpy39 source ~/.python_venvs/skpy39/bin/activate pip install scikit-learn pandas ipdb ipython matplotlib tqdm colormap easydev

(updated February 26, 2023) · 1 min · 39 words · Michal Piekarczyk

histogram overlays # Nice technique from https://srome.github.io/Covariate-Shift,-i.e.-Why-Prediction-Quality-Can-Degrade-In-Production-and-How-To-Fix-It/ # ... put two histograms on same plot ... def produce_overlayed_hists_for_col_dfs(col, dfs): fig = plt.figure(figsize=(12,12)) ax = fig.add_subplot(121) ax.hist(dfs[0][1][col], color='r', alpha=0.2, bins=50) ax.hist(dfs[1][1][col], color='b', alpha=0.2, bins=50) ax.set(title=f'{dfs[0][0]} (red) vs {dfs[1][0]} (blue)', ylabel=col) Basic goal looks like the below. sparse diagonal x axis ticks import matplotlib.pyplot as plt import pandas as pd import datetime def make_xtick_labels(x, step=5): '''Given x, step the labels every <step> Aka, take every <step>th x label ''' x_ticks = [i for i in range(len(x)) if i % step == 0] x_labels = [x[i] for i in x_ticks] return x_ticks, x_labels Did not add an example x , y yet, but showing an example where x contains dates and y is numeric. x = ? y = ? fig = plt.figure(figsize=(12,4)) ax = fig.add_subplot(111) ax.plot(y) x_ticks, x_labels = make_xtick_labels(x, step=20) ax.set_xticks(x_ticks) ax.set_xticklabels(x_labels, rotation=-45) fig.show() Multiple time plots and fill nulls with zeroes! Need to fill the nulls, otherwise the behavior can be weird. 
Here, have a df with timestamp and label , that is sparse, (meaning there are missing rows) import matplotlib.pyplot as plt import pandas as pd import datetime import random def random_df(size=500): X = [random.random() for _ in range(size)] vec = [] for (i, x) in enumerate(X): vec.extend([{ "label": ("one" if x <= 0.33 else ("two" if 0.33 < x <= 0.66 else "three")), "timestamp": datetime.date(2021, 1, 1) + datetime.timedelta(days=1*i) } for _ in range(random.randint(0, 50)) ]) return pd.DataFrame.from_records(vec) def fill_empties(statsdf): statsdf = statsdf.copy() for x in statsdf["date"].unique().tolist(): for label in statsdf.label.unique().tolist(): if statsdf[(statsdf.date == x) & (statsdf.label == label)].empty: statsdf = pd.concat([statsdf, pd.DataFrame.from_records([{"date": x, "label": label, "count": 0}])], ignore_index=True ) statsdf = statsdf.sort_values(by=["date", "label"]) return statsdf def plot_trends(df, out_loc): statsdf = df.groupby(by=['date', 'label']).size().reset_index().rename(columns={0: "count"}) statsdf = fill_empties(statsdf) fig = plt.figure(figsize=(12,4)) ax = fig.add_subplot(111) x = statsdf.date.unique().tolist() x_ticks, x_labels = make_xtick_labels(x, step=3) for label in statsdf.label.unique().tolist(): x = statsdf[statsdf.label == label]['date'].tolist() y = statsdf[statsdf.label == label]['count'].tolist() ax.plot(x, y, label=label) ax.set_xticks(x_ticks) ax.set_xticklabels(x_labels, rotation=-45) ax.legend() print('saving to ', out_loc) pylab.savefig(out_loc) pylab.close() # df = random_df(100) df["date"] = df["timestamp"].map(lambda x:x.strftime("%m-%d")) workdir = "some_folder" out_loc = f"{workdir}/trends.png" plot_trends(df, out_loc) Heatmaps are nice plt.figure(figsize=(10,10)) plt.imshow(bitmap) plt.colorbar() plt.grid(False) plt.show() using np.histogram and quantiles to spot check bimodal distributions I had this use case where I wanted to collect walltime from a service, from a dataset where a bimodal distribution 
was basically a given. I wanted the mean of the second distribution. Instead of trying to use clustering analysis like dbscan which would have probably worked, I just started collecting the time series np.histogram and quantile data and I was able to visually inspect / prove that the median is a good enough statistic in this case, without too much extra data preprocessing required! sampling data from athena every 7 days, here are two examples below. supporting codes… (I didn't add code for daa.run_it
label='q:75%') plt.title(f"walltime histogram at {bundle['date']}") plt.plot(bins, np.insert(H, 0, H[0]), drawstyle='steps', color='green') plt.grid(True) plt.legend() plt.show() bundle = outvec[0] show_da_stats(bundle) Nice how you can save figures from ipython if you need to import pylab import matplotlib.pyplot as plt plt.hist([1,2,3,4,1,2,3,4,1,2,1,2,2], bins=50) plt.title('Histogram blah') out_loc = '/your/location.blah.png' print('saving to ', out_loc) pylab.savefig(out_loc) pylab.close() Running this in a jupyter notebook (NOTE: this data is from one of the Keras Hello World datasets) , per below import matplotlib.pyplot as plt image = [[0, 0, 0, 0, 0, 0, 0, 0, 33, 96, 175, 156, 64, 14, 54, 137, 204, 194, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 73, 186, 177, 183, 175, 188, 232, 255, 223, 219, 194, 179, 186, 213, 146, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 35, 163, 140, 150, 152, 150, 146, 175, 175, 173, 171, 156, 152, 148, 129, 156, 140, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 150, 142, 140, 152, 160, 156, 146, 142, 127, 135, 133, 140, 140, 137, 133, 125, 169, 75, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 54, 167, 146, 129, 142, 137, 137, 131, 148, 148, 133, 131, 131, 131, 125, 140, 140, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 110, 188, 133, 146, 152, 133, 125, 127, 119, 129, 133, 119, 140, 131, 150, 14, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 221, 158, 137, 135, 123, 110, 110, 114, 108, 112, 117, 127, 142, 77, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 4, 0, 25, 158, 137, 125, 119, 119, 110, 117, 117, 110, 119, 127, 144, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 123, 156, 129, 112, 110, 102, 112, 100, 121, 117, 129, 114, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 125, 169, 127, 119, 106, 108, 104, 94, 121, 114, 129, 91, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 2, 0, 98, 171, 129, 112, 104, 114, 106, 102, 112, 104, 133, 64, 0, 4, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 2, 0, 66, 173, 135, 129, 98, 100, 119, 102, 108, 98, 135, 60, 0, 4, 0, 0, 0, 0, 0, 
0], [0, 0, 0, 0, 0, 0, 2, 0, 56, 171, 135, 127, 100, 108, 117, 85, 106, 110, 135, 66, 0, 4, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 52, 150, 129, 110, 100, 91, 102, 94, 83, 104, 123, 66, 0, 4, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 2, 0, 66, 167, 140, 148, 148, 127, 137, 152, 146, 146, 148, 96, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 45, 123, 94, 104, 96, 119, 121, 106, 98, 112, 87, 114, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 106, 89, 58, 50, 37, 50, 66, 56, 50, 75, 75, 137, 22, 0, 2, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 2, 0, 29, 148, 114, 106, 125, 89, 100, 133, 117, 131, 131, 131, 125, 112, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 100, 106, 114, 91, 137, 62, 102, 131, 89, 135, 112, 131, 108, 135, 37, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 146, 100, 108, 98, 144, 62, 106, 131, 87, 133, 104, 160, 117, 121, 68, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 33, 121, 108, 96, 100, 140, 71, 106, 127, 85, 140, 104, 150, 140, 114, 89, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 62, 119, 112, 102, 110, 137, 75, 106, 144, 81, 144, 108, 117, 154, 117, 104, 18, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 66, 121, 102, 112, 117, 131, 73, 104, 156, 77, 137, 135, 83, 179, 129, 121, 35, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 85, 127, 81, 125, 133, 119, 79, 100, 169, 83, 129, 175, 60, 163, 135, 146, 39, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 106, 129, 62, 140, 144, 108, 85, 83, 158, 85, 129, 175, 48, 146, 133, 135, 64, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 117, 119, 79, 140, 152, 102, 89, 110, 137, 96, 150, 196, 83, 144, 135, 133, 77, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 154, 121, 87, 140, 154, 112, 94, 52, 142, 100, 83, 152, 85, 160, 133, 100, 12, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 35, 4, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] plt.figure() plt.imshow(train_images[3]) plt.colorbar() plt.grid(False) plt.show() And wow that displays… And the matplot grid , wow this is cool too Example code from this tutorial According to help(plt.subplot) , plt.subplot(5, 5, i) below is 
an instruction to place the ith thing, within a 5x5 grid, so basically the count starts at 1 in the upper left corner and spreads the grid as if it were a tape, from 1 to 5*5 plt.figure(figsize=(10,10)) for i in range(25): plt.subplot(5,5,i+1) plt.xticks([]) plt.yticks([]) plt.grid(False) plt.imshow(train_images[i]) # , cmap=plt.cm.binary plt.xlabel(class_names[train_labels[i]]) plt.show() Obtaining image data from tensorflow import keras fashion_mnist = keras.datasets.fashion_mnist (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data() image = train_images[3]

(updated June 2, 2025) · 11 min · 2273 words · Michal Piekarczyk

concurrent.futures Recently at work I needed to add retry logic to some code that was using the concurrent python library. I had done some research and I ended up also answering this stack overflow question too in the process. I am finding concurrent.futures to be pretty nice! Of course joblib is nice too. Anyway, re-posting my answer below as well. import concurrent.futures import time import urllib from random import randint from collections import defaultdict URLS = ['http://www.foxnews.com/', 'http://www.cnn.com/', 'http://europe.wsj.com/', 'http://www.bbc.co.uk/', 'http://some-made-up-domain.com/'] URLS = [f"http://fake{i}.com" for i in range(20)] # Retrieve a single page and report the URL and contents def load_url(url, timeout): if "fake" in url: time.sleep(1) x = randint(1, 10) if x <= 5: return {"timeout": True, "error": "SimulatedTimeout", "url": url} elif x in [6, 7]: return {"error": "SomeOtherError", "url": url} else: return {"data": "<html>" + str(randint(1, 999999)) + "</html>", "url": url} try: with urllib.request.urlopen(url, timeout=timeout) as conn: data = conn.read() return {"data": data, "url": url} # except urllib.error.URLError as e: except Exception as e: if "TimeoutError" in repr(e): return {"timeout": True, "error": repr(e), "url": url} else: return {"error": repr(e), "url": url} todo = [{"url": url} for url in URLS] final_results = [] retry_counts = defaultdict(int) max_retries = 5 with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: while True: future_list = [executor.submit(load_url, item["url"], 60) for item in todo] todo = [] for future in concurrent.futures.as_completed(future_list): result = future.result() if result.get("data"): final_results.append({**result, "retries": retry_counts[result["url"]]}) elif result.get("error") and not result.get("timeout"): final_results.append({**result, "retries": retry_counts[result["url"]]}) elif result.get("timeout") and retry_counts[result["url"]] < max_retries: 
retry_counts[result["url"]] += 1 todo.append({"url": result["url"]}) else: final_results.append({**result, "reached_max_retries": True, "retries": retry_counts[result["url"]]}) if len(final_results) == len(URLS): print("Done!") break else: print(f"we are now {len(final_results)} out of {len(URLS)}") with the output ...

(updated February 26, 2023) · 3 min · 458 words · Michal Piekarczyk

environmental variable local injection using https://pypi.org/project/python-dotenv/ pip install -U python-dotenv Given a file like .env.test … FOO=hi from dotenv import load_dotenv, find_dotenv load_dotenv(find_dotenv(".env.test", raise_error_if_not_found=True)) import os os.getenv('FOO') # => 'hi'

(updated February 26, 2023) · 1 min · 30 words · Michal Piekarczyk

import numpy as np from bokeh.plotting import figure, show, output_file def doplot(x, y, **figure_kwargs): N = x.shape[0] radii = np.array([0.1,]*N) # print 'DEBUG, ', radii[:4], ', ', N colors = [ "#%02x%02x%02x" % (int(r), int(g), 150) for r, g in zip(50+2*x, 30+2*y) ] TOOLS="hover,crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,poly_select,lasso_select," p = figure(tools=TOOLS, **figure_kwargs) p.scatter(x, y, radius=radii, fill_color=colors, fill_alpha=0.6, line_color=None) output_file("color_scatter.html") show(p) # open a browser def make_data(N=100, trials=1000, minmax=(0, 1)): a, b = minmax data = [[sum(vec), fano(vec)] for vec in [a + (b - a)*np.random.random_sample(N) for i in range(trials)]] vec1, vec2 = zip(*data) return np.array(vec1), np.array(vec2) figure_kwargs = {'x_axis_label': 'sum(X)', 'y_axis_label': 'fano(X)', 'title': 'sum(X) vs fano(X)'} doplot(*make_data(minmax=(0,1)), **figure_kwargs) ...

(updated February 26, 2023) · 1 min · 104 words · Michal Piekarczyk

import sys import time import datetime file = sys.stderr def log(logfile, tag): now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M EST') with open(logfile, 'a') as fd: fd.write(f'{now} {tag}\n') def do(minutes, logfile, tag): log(logfile, f'{tag} start') seconds = minutes*60 for i in range(seconds): file.flush() #s = str(i%60).zfill(2) file.write(f'\r{i//60}:{str(i%60).zfill(2)}') time.sleep(1) log(logfile, f'{tag} end') And choose any logfile location and any tag ..

(updated February 26, 2023) · 1 min · 56 words · Michal Piekarczyk

line profiler Big fan of the line_profiler ( formerly here ) pip install line_profiler expensive.py import time @profile def foo(): for x in range(10): bar() flarg() @profile def bar(): time.sleep(.1) @profile def flarg(): time.sleep(.1) foo() profile (pandars38) ツ kernprof -lv expensive.py Wrote profile results to expensive.py.lprof Timer unit: 1e-06 s Total time: 2.06251 s File: expensive.py Function: foo at line 3 Line # Hits Time Per Hit % Time Line Contents ============================================================== 3 @profile 4 def foo(): 5 11 54.0 4.9 0.0 for x in range(10): 6 10 1027267.0 102726.7 49.8 bar() 7 10 1035191.0 103519.1 50.2 flarg() Total time: 1.02698 s File: expensive.py Function: bar at line 9 Line # Hits Time Per Hit % Time Line Contents ============================================================== 9 @profile 10 def bar(): 11 10 1026983.0 102698.3 100.0 time.sleep(.1) Total time: 1.0349 s File: expensive.py Function: flarg at line 13 Line # Hits Time Per Hit % Time Line Contents ============================================================== 13 @profile 14 def flarg(): 15 10 1034899.0 103489.9 100.0 time.sleep(.1)

(updated February 26, 2023) · 1 min · 164 words · Michal Piekarczyk

What the what Notes from after converting a project using the 2to3, of additional gotchas TOC StringIO Pickling Uuid xrange wow the silent division bug! func.func_name calling lambdas w/ boto3 and using BytesIO Bytes and json lambda , [ERROR] Runtime.MarshalError: Unable to marshal response: b'gAN9cQAoWA4 dict merging Meat StringIO Doing this fixes things typically.. Change import StringIO to try: from StringIO import StringIO except: from io import StringIO And update any StringIO.StringIO() to just StringIO() cPickle and pickle Because theres no more cPickle I changed cPickle to pickle and started getting this 226 with open(fn) as fd: --> 227 dtypes_dict = pickle.load(fd) 228 return dtypes_dict 229 TypeError: a bytes-like object is required, not 'str' because pickled objects encoded with the string like protocol need to be re-encoded I think. But I was able to actually read the python2 ASCII pickle by doing this. Worked for me with open(fn,'rb') as fd: dtypes_dict = pickle.load(fd) Treating somedict.keys() as a list In [32]: dtypes_dict.keys()[:5] --------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-32-41046874d947> in <module> ----> 1 dtypes_dict.keys()[:5] TypeError: 'dict_keys' object is not subscriptable I think just need change to this… list(dtypes_dict.keys())[:5] uuid Got this. 43 def make_nonce(): ---> 44 return uuid.uuid4().get_hex() 45 46 def make_date_s3_prefix(timestamp): AttributeError: 'UUID' object has no attribute 'get_hex' Changed to … In [12]: uu.hex Out[12]: '19487abb29fb4e8197df6f000c31b358' xrange no more xrange. it’s now just range note per here Partition code bug This func didnt crash in python 3 but the result was quite different. 
def get_partitions(vec, slice_size): assert slice_size > 0 assert isinstance(vec, list) num_slices = int(math.ceil(len(vec)/slice_size)) size_remainder = len(vec) - num_slices*slice_size slices = [vec[k*slice_size:k*slice_size+slice_size] for k in range(num_slices)] if size_remainder: slices.append(vec[-(size_remainder):]) return slices python 2: as expected ids = [2220706, 2220705, 2220703, 2220700, 2220696, 2220690, 2220688, 2220687, 2220682, 2220676, 2220674, 2220671] # len(ids) # 12 get_partitions(ids, 5) # => [[2220706, 2220705, 2220703, 2220700, 2220696], [2220690, 2220688, 2220687, 2220682, 2220676], [2220674, 2220671]] python 3, wo what the heck get_partitions(ids, 5) # => [[2220706, 2220705, 2220703, 2220700, 2220696], [2220690, 2220688, 2220687, 2220682, 2220676], [2220674, 2220671], [2220700, 2220696, 2220690, 2220688, 2220687, 2220682, 2220676, 2220674, 2220671]] The file where this function exists was missing the standard from __future__ import division, absolute_import, print_function, unicode_literals line, so that’s why this happened in the first place. The fix to make this work for both python2 and python3 was to rewrite the / division as // explicitly … def get_partitions(vec, slice_size): assert slice_size > 0 assert isinstance(vec, list) num_slices = len(vec)//slice_size size_remainder = len(vec) - num_slices*slice_size slices = [vec[k*slice_size:k*slice_size+slice_size] for k in range(num_slices)] if size_remainder: slices.append(vec[-(size_remainder):]) return slices No more Func name per https://docs.python.org/3/whatsnew/3.0.html#operators-and-special-methods godamn getattr(some_func, 'func_name') to retrieve the name of a func. 
no longer works in python 3 that is now some_func.__name__ :grimacing: similarly some_func.func_code was renamed to some_func.__code__ Notes on reading python2 pickle in python3 Given a pandas DataFrame written like this, cPickle.dumps(df) I was able to read it in python 3 like this with open('blah.pkl', 'rb') as fd: df = pickle.load(fd, encoding='latin1') # And if having read it from s3 to a bytes object, this worked too df = pickle.loads(pkl, encoding='latin1') Noticing boto3 uses bytes now now str Before it was possible to do this import boto3 import json from StringIO import StringIO client = boto3.client('lambda') json_payload = json.dumps(payload) s = StringIO(json_payload) version = '4' response = client.invoke( FunctionName='myBlahBlahLambda', InvocationType='RequestResponse', LogType='Tail', Payload=s, Qualifier=version) out_dict = json.loads(response.get('Payload').read()) return out_dict Now that complains with TypeError: Unicode-objects must be encoded before hashing But it works to use this instead… import boto3 import json from io import BytesIO client = boto3.client('lambda') json_payload = json.dumps(payload).encode('utf-8') # <-- encode s = BytesIO(json_payload) version = '4' response = client.invoke( FunctionName='myBlahBlahLambda', InvocationType='RequestResponse', LogType='Tail', Payload=s, Qualifier=version) out_dict = json.loads(response.get('Payload').read()) return out_dict Bytes and json Relevant to data obtained with requests and base64.b64encode for example. These now produce bytes as opposed to str. "TypeError: Object of type bytes is not JSON serializable", this comes up when trying to use json.dumps . Previously strings now bytes in there, so typically need to b’blah’.decode(‘utf-8’) lambda cannot return bytes/json [ERROR] Runtime.MarshalError: Unable to marshal response: b'gAN9cQAoWA4 happening when have bytes in the response.. Dict merging interestingly the dict() vs {} behavior is different.. 
In [34]: dict(**{'hi': 'there'}, **{'hello': 'there'}, **{'hello': 'sailor'}) --------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-34-3bc078749ddb> in <module> ----> 1 dict(**{'hi': 'there'}, **{'hello': 'there'}, **{'hello': 'sailor'}) TypeError: type object got multiple values for keyword argument 'hello' In [36]: dict(list({'hi': 'there'}.items())+ list({'hello': 'there'}.items())+ list({'he ...: llo': 'sailor'}.items())) Out[36]: {'hi': 'there', 'hello': 'sailor'} In [37]: {**{'hi': 'there'}, **{'hello': 'there'}, **{'hello': 'sailor'}} Out[37]: {'hi': 'there', 'hello': 'sailor'} urlparse From import urlparse To from urllib.parse import urlparse

(updated February 26, 2023) · 4 min · 760 words · Michal Piekarczyk

Generate a CTE from a local csv file import pandas as pd replace_nan = lambda x: x.replace('nan', 'null') def df_to_values(df, columns=None, replace_nans=True): if columns is None: columns = df.columns.tolist() newdata = str(list(df[columns].to_records( index=False)) )[1:-1] if replace_nans: newdata = replace_nan(newdata) return newdata def cte_from_csv(localfile, colgroups, cte_names, head=False): df = pd.read_csv(localfile) if head: df = df.head() return 'with ' + ', '.join([ f''' {cte_names[i]}({', '.join(colgroups[i])}) as ( VALUES {df_to_values(df, columns=colgroups[i], replace_nans=True)} ) ''' for i, _ in enumerate(colgroups) ]) temp.csv one,two,three 1,2.,3.3 ,2.3,3.5 11,.22,.003 Example loc = "temp.csv" print( cte_from_csv(loc, [['one', 'two', 'three']], ['foo'],)) => with foo(one, two, three) as ( VALUES (1., 2., 3.3), (null, 2.3, 3.5), (11., 0.22, 0.003) ) Dollar encode def doll_df_to_values(df, cols, cols_to_dollar_encode=None): data = [tuple(x) for x in df[cols].to_records(index=False)] vec = ['(' + ', '.join([enc(x[i], cols[i] in cols_to_dollar_encode) for (i, _) in enumerate(x)]) + ')' for x in data] return ', '.join(vec) def enc(x, dollar_enc): if dollar_enc: return f'$${x}$$' elif isinstance(x, str): return f"'{x}'" elif np.isnan(x): return 'null' else: return f"{x}"

(updated February 26, 2023) · 1 min · 165 words · Michal Piekarczyk

Search and return json paths def substring_exists_lower(substring, string): # f = lambda key, term: term in key.lower() return substring.lower() in string.lower() def path_join(path, key): return f'{path}{"." if path else ""}{key}' def find_term(path, term, node, found, only_leaves=False): # must be dict or list if not ((isinstance(node, dict)) or (isinstance(node, list))): return # look in this node if isinstance(node, dict): for key in node.keys(): if substring_exists_lower(term, key): if only_leaves: if not ((isinstance(node[key], dict)) or (isinstance(node[key], list))): found.add(path_join(path, key)) else: found.add(path_join(path, key)) for key in node.keys(): #if isinstance(node[key], dict): find_term(path_join(path, key), term, node[key], found, only_leaves) if isinstance(node, list): for i, x in enumerate(node): find_term(f'{path}[{i}]', term, node[i], found, only_leaves) Example found = set() find_term('', 'name', {}, found)

(updated February 26, 2023) · 1 min · 113 words · Michal Piekarczyk