import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Data loading
Let’s load a dataset of daily rain precipitation in Seattle in 2014.
# read_csv accepts a URL as well as a local path, so the simplest approach
# is to let pandas fetch the remote file directly
URL = "http://www-sop.inria.fr/members/Arnaud.Legout/formationPython/Exos/Seattle2014.csv"
# pandas is covered later on; for now this simply gives us an ndarray
# holding the values of the "PRCP" column
seattle_2014 = pd.read_csv(URL)
rainfall = seattle_2014["PRCP"].to_numpy()
# alternative: fetch the remote file with urllib and save it locally
# from urllib.request import urlopen
# with open("Seattle2014.csv", "w", encoding='utf-8') as f:
#     with urlopen(URL) as u:
#         f.write(u.read().decode('utf-8'))
# we then extract the precipitation column with pandas:
# rainfall is an array holding the precipitation
# for each day of 2014
# rainfall = pd.read_csv('Seattle2014.csv')['PRCP'].to_numpy()

## Let’s visualize
[assignment]: plot the amount of rain (in mm) over time; make sure you put a proper label on both axes, and a title on the whole figure
# your code here

## Let’s answer the following questions
What is the shape and dtype of the ndarray?
# your code here

How many rainy days?
# your code here

Average precipitation over the year?
# your code here

Average precipitation on the rainy days?
# your code here

Mean precipitation in January?
# your code here

Mean precipitation in January on the rainy days?
# your code here

## A transition to pandas
# In practice we would not work on a bare ndarray; here is the pandas way.
# Start by wrapping the daily values in a pandas Series
s = pd.Series(rainfall)
# then turn the positional index (0, 1, 2, ...) into real dates: each
# position is a number of days counted from January 1st 2014, the year
# covered by the dataset
s.index = pd.to_datetime(s.index, unit='D',
                         origin=pd.Timestamp('2014-01-01'))
# resample per month and sum the daily values to get the total monthly rain
# ('ME' is the month-end alias; the former 'm' spelling is rejected by
# recent pandas, which is what produced the traceback recorded below)
s = s.resample('ME').sum()
ValueError Traceback (most recent call last)
File pandas/_libs/tslibs/offsets.pyx:6313, in pandas._libs.tslibs.offsets.to_offset()
-> 6313 'Could not get source, probably due dynamically evaluated source code.'
File pandas/_libs/tslibs/offsets.pyx:6180, in pandas._libs.tslibs.offsets._validate_to_offset_alias()
-> 6180 'Could not get source, probably due dynamically evaluated source code.'
ValueError: 'm' is no longer supported for offsets. Please use 'ME' instead.
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
Cell In[10], line 10
6 s.index = pd.to_datetime(s.index, unit='D',
7 origin=pd.Timestamp('1/1/2004'))
8
9 # possibly resample per month to get the total monthly rain
---> 10 s = s.resample('m').max()
File /__w/exos-ds/exos-ds/venv/lib/python3.14/site-packages/pandas/core/generic.py:9423, in NDFrame.resample(self, rule, closed, label, convention, on, level, origin, offset, group_keys)
9419 Freq: 17min, dtype: int64
9420 """
9421 from pandas.core.resample import get_resampler
9422
-> 9423 return get_resampler(
9424 cast("Series | DataFrame", self),
9425 freq=rule,
9426 label=label,
File /__w/exos-ds/exos-ds/venv/lib/python3.14/site-packages/pandas/core/resample.py:2334, in get_resampler(obj, **kwds)
2330 def get_resampler(obj: Series | DataFrame, **kwds) -> Resampler:
2331 """
2332 Create a TimeGrouper and return our resampler.
2333 """
-> 2334 tg = TimeGrouper(obj, **kwds) # type: ignore[arg-type]
2335 return tg._get_resampler(obj)
File /__w/exos-ds/exos-ds/venv/lib/python3.14/site-packages/pandas/core/resample.py:2420, in TimeGrouper.__init__(self, obj, freq, key, closed, label, how, fill_method, limit, convention, origin, offset, group_keys, **kwargs)
2418 freq = to_offset(freq, is_period=True)
2419 else:
-> 2420 freq = to_offset(freq)
2422 if not isinstance(freq, Tick):
2423 if offset is not None:
File pandas/_libs/tslibs/offsets.pyx:6229, in pandas._libs.tslibs.offsets.to_offset()
-> 6229 'Could not get source, probably due dynamically evaluated source code.'
File pandas/_libs/tslibs/offsets.pyx:6352, in pandas._libs.tslibs.offsets.to_offset()
-> 6352 'Could not get source, probably due dynamically evaluated source code.'
File pandas/_libs/tslibs/offsets.pyx:6137, in pandas._libs.tslibs.offsets.raise_invalid_freq()
-> 6137 'Could not get source, probably due dynamically evaluated source code.'
ValueError: Invalid frequency: m. Failed to parse with error message: ValueError("'m' is no longer supported for offsets. Please use 'ME' instead.")
# then plot
%matplotlib ipympl
s.plot.bar()
plt.xlabel('month')
plt.ylabel('mm')
plt.title('Rainy days in 2014 at Seattle')
fig = plt.gcf()
fig.autofmt_xdate()
# plt.show() # if in a terminal