Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

data loading

Let’s load a dataset of daily rain precipitation in Seattle in 2014

# Where the CSV lives. pd.read_csv accepts a URL just as well as a local
# path, so no separate download step is required.
URL = "http://www-sop.inria.fr/members/Arnaud.Legout/formationPython/Exos/Seattle2014.csv"

# The pandas calls used here are covered later in the course; for now it is
# enough to know that this line produces a numpy ndarray built from the
# "PRCP" (precipitation) column, one entry per row of the file.
# NOTE(review): the unit of PRCP is not visible here; NOAA daily files
# commonly store it in tenths of a millimetre -- confirm before labelling
# values as mm.
rainfall = pd.read_csv(URL).loc[:, "PRCP"].to_numpy()

# Alternative: fetch the remote file with urllib and keep a local copy.
# from urllib.request import urlopen
# with open("Seattle2014.csv", "w", encoding='utf-8') as f:
#    with urlopen(URL) as u:
#        f.write(u.read().decode('utf-8'))

# ... and then extract the precipitation column from that local copy;
# rainfall is then the array of precipitation per day,
# for each day of 2014.
# rainfall = pd.read_csv('Seattle2014.csv')['PRCP'].to_numpy()

Let’s visualize

[assignment]: plot the amount of rain (in mm) over time; make sure you put a proper label on both axes, and a title on the figure as a whole

# your code here

Let’s answer the following questions

What are the shape and dtype of the ndarray?

# your code here

How many rainy days?

# your code here

Average precipitation over the year?

# your code here

Average precipitation over the rainy days only?

# your code here

Mean precipitation in January?

# your code here

Mean precipitation in January over the rainy days only?

# your code here

A transition to pandas

# In practice we would not do the date bookkeeping by hand on a raw ndarray;
# pandas does it for us. Start by wrapping the daily values in a Series,
# which gets a default integer index 0, 1, 2, ...
s = pd.Series(rainfall)

# Turn that integer index into real calendar dates: position i is taken to be
# i days after January 1st, 2014 (the year this dataset covers). An ISO date
# string is used so the origin cannot be misread as day/month vs month/day.
s.index = pd.to_datetime(s.index, unit='D',
                         origin=pd.Timestamp('2014-01-01'))

# Resample per calendar month and add up the daily values to get the total
# monthly rain. 'ME' (month end) is the alias that replaced the removed 'm'/'M'
# offset; it requires pandas >= 2.2.
s = s.resample('ME').sum()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File pandas/_libs/tslibs/offsets.pyx:6313, in pandas._libs.tslibs.offsets.to_offset()
-> 6313 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/tslibs/offsets.pyx:6180, in pandas._libs.tslibs.offsets._validate_to_offset_alias()
-> 6180 'Could not get source, probably due dynamically evaluated source code.'

ValueError: 'm' is no longer supported for offsets. Please use 'ME' instead.

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
Cell In[10], line 10
      6 s.index = pd.to_datetime(s.index, unit='D',
      7                          origin=pd.Timestamp('1/1/2004'))
      8 
      9 # possibly resample per month to get the total monthly rain
---> 10 s = s.resample('m').max()

File /__w/exos-ds/exos-ds/venv/lib/python3.14/site-packages/pandas/core/generic.py:9423, in NDFrame.resample(self, rule, closed, label, convention, on, level, origin, offset, group_keys)
   9419         Freq: 17min, dtype: int64
   9420         """
   9421         from pandas.core.resample import get_resampler
   9422 
-> 9423         return get_resampler(
   9424             cast("Series | DataFrame", self),
   9425             freq=rule,
   9426             label=label,

File /__w/exos-ds/exos-ds/venv/lib/python3.14/site-packages/pandas/core/resample.py:2334, in get_resampler(obj, **kwds)
   2330 def get_resampler(obj: Series | DataFrame, **kwds) -> Resampler:
   2331     """
   2332     Create a TimeGrouper and return our resampler.
   2333     """
-> 2334     tg = TimeGrouper(obj, **kwds)  # type: ignore[arg-type]
   2335     return tg._get_resampler(obj)

File /__w/exos-ds/exos-ds/venv/lib/python3.14/site-packages/pandas/core/resample.py:2420, in TimeGrouper.__init__(self, obj, freq, key, closed, label, how, fill_method, limit, convention, origin, offset, group_keys, **kwargs)
   2418     freq = to_offset(freq, is_period=True)
   2419 else:
-> 2420     freq = to_offset(freq)
   2422 if not isinstance(freq, Tick):
   2423     if offset is not None:

File pandas/_libs/tslibs/offsets.pyx:6229, in pandas._libs.tslibs.offsets.to_offset()
-> 6229 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/tslibs/offsets.pyx:6352, in pandas._libs.tslibs.offsets.to_offset()
-> 6352 'Could not get source, probably due dynamically evaluated source code.'

File pandas/_libs/tslibs/offsets.pyx:6137, in pandas._libs.tslibs.offsets.raise_invalid_freq()
-> 6137 'Could not get source, probably due dynamically evaluated source code.'

ValueError: Invalid frequency: m. Failed to parse with error message: ValueError("'m' is no longer supported for offsets. Please use 'ME' instead.")
# then plot

%matplotlib ipympl

s.plot.bar()
plt.xlabel('month')
plt.ylabel('mm')
plt.title('Rainy days in 2014 at Seattle')
fig = plt.gcf()
fig.autofmt_xdate()
# plt.show() # if in a terminal